# Source code for sparseml.pytorch.utils.yolo_helpers

# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Helper functions and classes for creating and training PyTorch Yolo models
"""


from typing import Iterable, List, Tuple

import torch
from torch import Tensor


try:
    from torchvision.ops.boxes import batched_nms
except Exception:
    batched_nms = None


# Public API of this module. Was previously assigned to the plain name `all`,
# which shadowed the builtin `all()` and did not declare the export list;
# `__all__` is the name Python's `from module import *` machinery reads.
__all__ = [
    "get_output_grid_shapes",
    "yolo_v3_anchor_groups",
    "build_targets",
    "box_giou",
    "YoloGrids",
    "postprocess_yolo",
]


def get_output_grid_shapes(outputs: List[Tensor]) -> List[Tensor]:
    """
    :param outputs: List of Yolo model outputs
    :return: A list of the grid dimensions for each of the Yolo outputs
    """
    grid_shapes = []
    for output in outputs:
        # dims 2 and 3 of a Yolo output hold the grid height/width
        grid_shapes.append(Tensor(list(output.shape[2:4])))
    return grid_shapes
def yolo_v3_anchor_groups() -> List[Tensor]:
    """
    :return: List of the default anchor coordinate groups for Yolo V3 outputs
    """
    # anchor (width, height) pairs, ordered from coarsest to finest output grid
    large_anchors = [[116, 90], [156, 198], [373, 326]]
    medium_anchors = [[30, 61], [62, 45], [59, 119]]
    small_anchors = [[10, 13], [16, 30], [33, 23]]
    return [Tensor(group) for group in (large_anchors, medium_anchors, small_anchors)]
def _width_height_iou(wh_a: Tensor, wh_b: Tensor) -> Tensor: # [n,2], [m,2] -> [n,m] wh_a = wh_a.unsqueeze(1) wh_b = wh_b.unsqueeze(0) area_a = wh_a.prod(2) area_b = wh_b.prod(2) intersection = torch.min(wh_a, wh_b).prod(2) return intersection / (area_a + area_b - intersection)
def build_targets(
    targets: Tensor,
    anchors_groups: List[Tensor],
    grid_shapes: List[Tensor],
    iou_threshold: float = 0.2,
) -> Tuple[List[Tensor], List[Tensor], List[Tensor], List[Tensor]]:
    """
    Returns a representation of the image targets according to the given
    anchor groups and grid shapes.

    :param targets: Yolo data targets tensor of shape n,6 with columns image
        number, class, center_x, center_y, width, height
    :param anchors_groups: List of n,2 Tensors of anchor point coordinates for
        each of the Yolo model's detectors
    :param grid_shapes: List of n,2 Tensors of the Yolo models output grid shapes
        for a particular input shape
    :param iou_threshold: the minimum IoU value to consider an object box to match
        to an anchor point. Default is 0.2
    :return: tuple of four lists, one entry per anchor group:
        classes (target class ids), boxes (xy offset within the grid cell
        concatenated with grid-space wh), indices (tuples of image, anchor,
        grid_x, grid_y index tensors), and the matched anchor coordinates
    """
    num_targets = targets.shape[0]
    num_anchors = len(anchors_groups[0])
    classes, boxes, indices, target_anchors = [], [], [], []

    # replicate targets once per anchor and append the anchor index as a
    # 7th column -> shape (num_anchors, num_targets, 7)
    anchor_indices = (
        torch.arange(num_anchors, device=targets.device)
        .float()
        .view(num_anchors, 1)
        .repeat(1, num_targets)
    )
    targets = torch.cat(
        (targets.repeat(num_anchors, 1, 1), anchor_indices[:, :, None]), 2
    )

    # offsets to neighboring cells: none, right (j), down (k), left (l), up (m)
    offset_bias = 0.5
    offset_values = (
        torch.tensor(
            [[0, 0], [1, 0], [0, 1], [-1, 0], [0, -1]],  # None, j,k,l,m
            device=targets.device,
        ).float()
        * offset_bias
    )  # offsets

    # scale vector: columns 2:6 (xywh) are scaled into grid space, the other
    # columns (image, class, anchor idx) are multiplied by 1 and left unchanged
    grid_scale = torch.ones(7, device=targets.device)  # tensor for grid space scaling

    for idx, anchors in enumerate(anchors_groups):
        # scale targets to current grid
        anchors = anchors.to(targets.device)
        grid_scale[2:6] = torch.tensor(grid_shapes[idx])[[0, 1, 0, 1]]
        scaled_targets = targets * grid_scale
        if num_targets:
            # keep only (anchor, target) pairs whose wh IoU clears the threshold
            wh_iou_mask = (
                _width_height_iou(anchors, scaled_targets[0, :, 4:6]) > iou_threshold
            )
            scaled_targets = scaled_targets[wh_iou_mask]

            # also assign each target to neighboring cells when its center lies
            # within offset_bias of a cell border (and not on the grid edge),
            # compensating for the later grid index rounding
            targets_xy = scaled_targets[:, 2:4]
            targets_xy_inv = grid_scale[[2, 3]] - targets_xy
            j, k = ((targets_xy % 1.0 < offset_bias) & (targets_xy > 1.0)).t()
            l, m = (
                (targets_xy_inv % 1.0 < offset_bias) & (targets_xy_inv > 1.0)
            ).t()
            # row 0 (ones) always keeps the original cell; rows 1-4 add copies
            # shifted to the matching neighbor cells
            offset_filter = torch.stack((torch.ones_like(j), j, k, l, m))
            scaled_targets = scaled_targets.repeat((5, 1, 1))[offset_filter]
            offsets = (torch.zeros_like(targets_xy)[None] + offset_values[:, None])[
                offset_filter
            ]
        else:
            # no targets: keep the empty (0, 7) slice so downstream indexing works
            scaled_targets = targets[0]
            offsets = 0

        # extract fields from the (n, 7) rows
        image, clazz = scaled_targets[:, :2].long().t()
        targets_xy = scaled_targets[:, 2:4]
        targets_wh = scaled_targets[:, 4:6]
        grid_indices = (targets_xy - offsets).long()
        grid_x, grid_y = grid_indices.t()
        anchor_idxs = scaled_targets[:, 6].long()

        indices.append((image, anchor_idxs, grid_x, grid_y))
        # box target: fractional xy position within the assigned cell + wh
        boxes.append(torch.cat((targets_xy - grid_indices.float(), targets_wh), 1))
        target_anchors.append(anchors[anchor_idxs])
        classes.append(clazz)

    return classes, boxes, indices, target_anchors
def box_giou(boxes_a: Tensor, boxes_b: Tensor) -> Tensor:
    """
    :param boxes_a: 4,N Tensor of xywh bounding boxes
    :param boxes_b: 4,N Tensor of xywh bounding boxes
    :return: Shape N Tensor of GIoU values between boxes in the input tensors
    """
    eps = 1e-9  # guards against division by zero for degenerate boxes

    # convert center/size rows to left-top and right-bottom corner pairs (2,N)
    half_wh_a = boxes_a[2:4] / 2.0
    half_wh_b = boxes_b[2:4] / 2.0
    lt_a, rb_a = boxes_a[0:2] - half_wh_a, boxes_a[0:2] + half_wh_a
    lt_b, rb_b = boxes_b[0:2] - half_wh_b, boxes_b[0:2] + half_wh_b

    # base IoU
    inter_wh = (torch.min(rb_a, rb_b) - torch.max(lt_a, lt_b)).clamp(0)
    inter = inter_wh[0] * inter_wh[1]
    union = boxes_a[2] * boxes_a[3] + boxes_b[2] * boxes_b[3] - inter + eps
    iou = inter / union

    # smallest enclosing (convex) box area
    convex_wh = torch.max(rb_a, rb_b) - torch.min(lt_a, lt_b)
    convex_area = convex_wh[0] * convex_wh[1] + eps

    # GIoU = IoU - (convex \ union) / convex
    return iou - (convex_area - union) / convex_area
class YoloGrids(object):
    """
    Helper class to compute and store Yolo output and anchor box grids

    :param anchor_groups: List of n,2 tensors of the Yolo model's anchor points
        for each output group. Defaults to yolo_v3_anchor_groups
    """

    def __init__(self, anchor_groups: List[Tensor] = None):
        self._grids = {}  # cache of (size_x, size_y) -> output box grid
        anchor_groups = anchor_groups or yolo_v3_anchor_groups()
        # bug fix: previously built anchor grids from yolo_v3_anchor_groups()
        # unconditionally, silently ignoring a user-supplied anchor_groups
        self._anchor_grids = [
            t.clone().view(1, -1, 1, 1, 2) for t in anchor_groups
        ]

    def get_grid(self, size_x: int, size_y: int) -> Tensor:
        """
        :param size_x: grid size x
        :param size_y: grid size y
        :return: Yolo output box grid for size x,y to be used for model output
            decoding. will have shape (1, 1, size_y, size_x, 2)
        """
        grid_shape = (size_x, size_y)
        if grid_shape not in self._grids:
            # meshgrid yields (size_y, size_x) coordinate planes; stacking
            # (x, y) last gives each cell its own (x, y) index pair
            coords_y, coords_x = torch.meshgrid(
                [torch.arange(size_y), torch.arange(size_x)]
            )
            grid = torch.stack((coords_x, coords_y), 2)
            self._grids[grid_shape] = grid.view(1, 1, size_y, size_x, 2)
        return self._grids[grid_shape]

    def get_anchor_grid(self, group_idx: int) -> Tensor:
        """
        :param group_idx: Index of output group for this anchor grid
        :return: grid tensor of shape 1, num_anchors, 1, 1, 2
        """
        return self._anchor_grids[group_idx]

    def num_anchor_grids(self) -> int:
        """
        :return: The number of anchor grids available (number of yolo model outputs)
        """
        return len(self._anchor_grids)
def _xywh_to_ltrb(boxes, in_place: bool = False): if not in_place: boxes = boxes.clone() boxes[:, 0], boxes[:, 2] = ( # ltrb x boxes[:, 0] - boxes[:, 2] / 2.0, boxes[:, 0] + boxes[:, 2] / 2.0, ) boxes[:, 1], boxes[:, 3] = ( # ltrb y boxes[:, 1] - boxes[:, 3] / 2.0, boxes[:, 1] + boxes[:, 3] / 2.0, ) return boxes
def postprocess_yolo(
    preds: List[Tensor],
    input_shape: Iterable[int],
    yolo_grids: YoloGrids = None,
    confidence_threshold: float = 0.1,
    iou_threshold: float = 0.6,
    max_detections: int = 300,
) -> List[Tuple[Tensor, Tensor, Tensor]]:
    """
    Decode the outputs of a Yolo model and perform non maximum suppression
    on the predicted boxes.

    :param preds: list of Yolo model output tensors
    :param input_shape: shape of input image to model. Default is [640, 640]
    :param yolo_grids: optional YoloGrids object for caching previously used grid
        shapes
    :param confidence_threshold: minimum confidence score for a prediction to be
        considered a detection. Default is 0.1
    :param iou_threshold: IoU threshold for non maximum suppression. Default is
        0.6
    :param max_detections: maximum number of detections after nms. Default is 300
    :return: List of predicted bounding boxes (n,4), labels, and scores for each
        output in the batch
    """
    if batched_nms is None:
        # torchvision import failed at module load; surface it here at use-time
        raise RuntimeError(
            "Unable to import batched_nms from torchvision.ops try upgrading your"
            " torch and torchvision versions"
        )
    yolo_grids = yolo_grids or YoloGrids()

    # decode each of the model output grids then concatenate
    outputs = []
    for idx, pred in enumerate(preds):
        pred = pred.sigmoid()

        # build grid and calculate stride
        grid_shape = pred.shape[2:4]
        grid = yolo_grids.get_grid(*grid_shape)
        anchor_grid = yolo_grids.get_anchor_grid(idx)
        # NOTE(review): stride derived from dim 0 only — assumes square
        # input/grid aspect handled upstream
        stride = input_shape[0] / grid_shape[0]

        # decode xywh box values (Yolo v5-style decode: sigmoid outputs mapped
        # to cell-relative centers and anchor-scaled sizes)
        pred[..., 0:2] = (pred[..., 0:2] * 2.0 - 0.5 + grid) * stride
        pred[..., 2:4] = (pred[..., 2:4] * 2) ** 2 * anchor_grid

        # flatten anchor and grid dimensions -> (bs, num_predictions, num_classes + 5)
        outputs.append(pred.view(pred.size(0), -1, pred.size(-1)))
    outputs = torch.cat(outputs, 1)

    # perform nms on each image in batch
    nms_outputs = []
    for image_idx, output in enumerate(outputs):
        # filter out low confidence predictions by objectness (column 4)
        confidence_mask = output[..., 4] > confidence_threshold
        output = output[confidence_mask]
        if output.size(0) == 0:  # no predictions, return empty tensor
            nms_outputs.append(torch.empty(0, 6))
            continue

        # scale class confidences by object confidence, convert to ltrb
        output[:, 5:] *= output[:, 4:5]
        _xywh_to_ltrb(output[:, :4], in_place=True)

        # attach labels of all positive predictions; a prediction may yield
        # multiple (box, class) rows when several classes clear the threshold
        class_confidence_mask = output[:, 5:] > confidence_threshold
        pred_idxs, class_idxs = class_confidence_mask.nonzero(as_tuple=False).t()
        output = torch.cat(
            [
                output[pred_idxs, :4],
                output[pred_idxs, class_idxs + 5].unsqueeze(-1),
                class_idxs.float().unsqueeze(-1),
            ],
            1,
        )
        if output.size(0) == 0:  # no predictions, return empty tensor
            nms_outputs.append(torch.empty(0, 6))
            continue

        # run per-class nms; rows are now (l, t, r, b, score, class)
        nms_filter = batched_nms(  # boxes, scores, labels, threshold
            output[:, :4], output[:, 4], output[:, 5], iou_threshold
        )
        if nms_filter.size(0) > max_detections:
            nms_filter = nms_filter[:max_detections]
        output = output[nms_filter]

        # extract outputs, rescale boxes to [0, 1]
        boxes = output[:, :4]
        boxes[:, [0, 2]] /= input_shape[0]  # scale x
        boxes[:, [1, 3]] /= input_shape[1]  # scale y
        labels = output[:, 5].long()
        scores = output[:, 4]
        nms_outputs.append((boxes, labels, scores))

    return nms_outputs