# Source code for scoutbot.agg

# -*- coding: utf-8 -*-
"""Aggregation (AGG) returns unified detects for an image given its individual tile detections

This module defines how the tile-base localization detection results are aggregated
at the image level.  This includes the ability to weight the importance of detections
along the border of each tile within an image, and performing non-maximum suppression (NMS)
on the combined results.
"""
import os

import numpy as np
import utool as ut

from scoutbot import log

# Default width of the border band of a tile in which detections are down-weighted
# by ``demosaic``.  Presumably expressed in pixels, the same unit as the tile
# width and height it is divided by -- TODO confirm.
MARGIN = 32.0

# Name of the configuration used when a caller does not pass one explicitly.
# ``AGG_CONFIG`` takes precedence over the general ``CONFIG`` environment variable;
# ``mvp`` is the fallback when neither is set.  The value is normalized to
# stripped lower-case before it is looked up.
DEFAULT_CONFIG = os.getenv('AGG_CONFIG', os.getenv('CONFIG', 'mvp')).strip().lower()
CONFIGS = {
    'phase1': {
        'thresh': 0.5077,
        'nms': 0.8,
    },
    'mvp': {
        'thresh': 0.0,  # Disabled: pending validation
        'nms': 0.8,
    },
}
# Register the aliases BEFORE resolving the default, so that ``old`` and ``new``
# are accepted as environment values instead of failing with a KeyError on import.
CONFIGS['old'] = CONFIGS['phase1']
CONFIGS['new'] = CONFIGS['mvp']
# Validate before the lookup below; otherwise the lookup itself raises first and
# this check can never report anything.
assert (
    DEFAULT_CONFIG in CONFIGS
), f'Unknown AGG config {DEFAULT_CONFIG!r}, expected one of {sorted(CONFIGS)}'
# ``None`` maps to whichever configuration was selected as the default.
CONFIGS[None] = CONFIGS[DEFAULT_CONFIG]


def iou(box1, box2):
    """
    Computes the areas needed for an IoU (Intersection over Union) ratio of two boxes.

    Note that the ratio itself is not returned; the caller combines the returned
    areas as needed (e.g., ``inter / union`` or ``inter / area1``).

    Each box dictionary must have a structure with the following keys:

        ::

            {
                'xtl': x_top_left
                'ytl': y_top_left
                'xbr': x_bottom_right
                'ybr': y_bottom_right
                'w': width (optional)
                'h': height (optional)
            }

    The ``(xtl, ytl)`` coordinate is the top-left corner of the box.

    The ``(xbr, ybr)`` coordinate is the opposite bottom-right corner of the box.

    When ``w`` / ``h`` are present they are used as-is to compute the box's area;
    when absent, they are derived from the corners (``xbr - xtl`` / ``ybr - ytl``).

    The order of the boxes does not impact the calculation of the intersection
    and union values.

    Args:
        box1 (dict): a dictionary of the first bounding box's dimensions
        box2 (dict): a dictionary of the second bounding box's dimensions

    Returns:
        tuple ( number, number, number, number ):
            - the area of the first box
            - the area of the second box
            - the area of the intersection (overlapping area) between the boxes,
              which is ``0.0`` when the boxes do not overlap or only touch
            - the area of the union (combined area) between the boxes
    """
    inter_xtl = max(box1['xtl'], box2['xtl'])
    inter_ytl = max(box1['ytl'], box2['ytl'])
    inter_xbr = min(box1['xbr'], box2['xbr'])
    inter_ybr = min(box1['ybr'], box2['ybr'])

    inter_w = inter_xbr - inter_xtl
    inter_h = inter_ybr - inter_ytl

    # A non-positive extent along either axis means the boxes are disjoint (or only
    # share an edge), so there is no overlapping area.
    inter = inter_w * inter_h if inter_w > 0 and inter_h > 0 else 0.0

    areas = []
    for box in (box1, box2):
        # Prefer the explicit size when the caller provides it, fall back to corners
        box_w = box['w'] if 'w' in box else box['xbr'] - box['xtl']
        box_h = box['h'] if 'h' in box else box['ybr'] - box['ytl']
        areas.append(box_w * box_h)
    area1, area2 = areas

    union = area1 + area2 - inter

    return area1, area2, inter, union


def demosaic(img_shape, tile_grids, loc_outputs, margin=MARGIN):
    """
    Maps tile-level detections back into the coordinate system of the full image.

    For every detection, the box is shifted by the offset of its tile, its confidence
    is scaled by the square root of the fraction of the box's area that lies inside
    the tile's inner region (the tile shrunk by ``margin`` on every side), and the
    shifted box is clipped to the image bounds.  Detections whose weighted confidence
    is not positive after rounding to 4 decimals, or whose clipped box has no area,
    are dropped.

    Args:
        img_shape (tuple): a tuple of the image shape as ``h, w, c`` or ``h, w``
        tile_grids (list of dict): a list of tile coordinates, each with the keys
            ``x``, ``y``, ``w`` and ``h``
        loc_outputs (list of list of dict): the output predictions from the Localizer,
            one list per tile; each prediction has the keys ``x``, ``y``, ``w``,
            ``h``, ``c`` (confidence) and ``l`` (label)
        margin (float, optional): the margin of the tile used to weight predictions.
            Defaults to 32.0

    Returns:
        list ( dict ): list of Localizer predictions in image coordinates, each with
        the keys ``l``, ``c``, ``x``, ``y``, ``w`` and ``h``
    """
    assert len(tile_grids) == len(loc_outputs)

    img_h, img_w = img_shape[:2]

    results = []
    for grid, tile_detects in zip(tile_grids, loc_outputs):
        offset_x, offset_y = grid['x'], grid['y']
        grid_w, grid_h = grid['w'], grid['h']

        for det in tile_detects:
            x, y = det['x'], det['y']
            w, h = det['w'], det['h']
            conf, label = det['c'], det['l']

            # Express the detection as fractions of the tile's size
            relative_box = {
                'xtl': x / grid_w,
                'ytl': y / grid_h,
                'xbr': (x + w) / grid_w,
                'ybr': (y + h) / grid_h,
                'w': w / grid_w,
                'h': h / grid_h,
            }

            # The inner region of the tile, in the same fractional units
            pad_w = margin / grid_w
            pad_h = margin / grid_h
            inner_box = {
                'xtl': pad_w,
                'ytl': pad_h,
                'xbr': 1.0 - pad_w,
                'ybr': 1.0 - pad_h,
                'w': 1.0 - (2.0 * pad_w),
                'h': 1.0 - (2.0 * pad_h),
            }
            area, _, inter, _ = iou(relative_box, inner_box)

            # Fraction of the detection that falls inside the inner region
            if area <= 0:
                overlap = 0.0
            else:
                overlap = inter / area
            overlap = round(overlap, 8)
            assert 0.0 <= overlap <= 1.0

            weighted_conf = round(conf * np.sqrt(overlap), 4)
            if weighted_conf <= 0.0:
                continue

            # Shift into image coordinates, rounding the origin and size separately
            left = int(np.around(offset_x + x))
            top = int(np.around(offset_y + y))
            right = left + int(np.around(w))
            bottom = top + int(np.around(h))

            # Clip every edge to the image frame
            left = min(max(left, 0), img_w)
            top = min(max(top, 0), img_h)
            right = min(max(right, 0), img_w)
            bottom = min(max(bottom, 0), img_h)
            clipped_w = right - left
            clipped_h = bottom - top

            # Nothing of the box is left inside the image
            if clipped_w * clipped_h <= 0.0:
                continue

            results.append(
                {
                    'l': label,
                    'c': weighted_conf,
                    'x': left,
                    'y': top,
                    'w': clipped_w,
                    'h': clipped_h,
                }
            )

    return results


def compute(
    img_shape, tile_grids, loc_outputs, config=None, agg_thresh=None, nms_thresh=None
):
    """
    Aggregate tile-level detections into a single list of image-level detections.

    The tile detections are demosaiced onto the image, filtered by confidence, and
    then reduced with non-maximum suppression (NMS).  The surviving detections are
    returned sorted by confidence, highest first.

    Args:
        img_shape (tuple): a tuple of the image shape as ``h, w, c`` or ``h, w``
        tile_grids (list of dict): a list of tile coordinates
        loc_outputs (list of list of dict): the output predictions from the Localizer,
            one list per tile.
        config (str or None, optional): the configuration to use, a key of
            ``CONFIGS`` such as ``phase1`` or ``mvp``.  When :obj:`None`, the
            module's ``DEFAULT_CONFIG`` is used.  Defaults to :obj:`None`.
        agg_thresh (float or None, optional): the minimum confidence an aggregated
            prediction must have to be kept.  When :obj:`None`, the ``thresh`` value
            of the configuration is used.  Defaults to :obj:`None`.
        nms_thresh (float or None, optional): the non-maximum suppression (NMS)
            threshold for the aggregated predictions.  When :obj:`None`, the ``nms``
            value of the configuration is used.  Defaults to :obj:`None`.

    Returns:
        list ( dict ): list of Localizer predictions
    """
    from scoutbot.agg.py_cpu_nms import py_cpu_nms

    assert len(tile_grids) == len(loc_outputs)

    # Explicit arguments win; anything left unset comes from the configuration
    config = DEFAULT_CONFIG if config is None else config
    agg_thresh = CONFIGS[config]['thresh'] if agg_thresh is None else agg_thresh
    nms_thresh = CONFIGS[config]['nms'] if nms_thresh is None else nms_thresh

    log.debug(f'Aggregating {len(tile_grids)} tiles onto {img_shape} canvas')

    final = []
    if len(tile_grids) > 0:
        # Bring all tile detections into image coordinates, then drop weak ones
        candidates = [
            candidate
            for candidate in demosaic(img_shape, tile_grids, loc_outputs)
            if candidate['c'] >= agg_thresh
        ]

        if len(candidates) > 0:
            # One ``xtl, ytl, xbr, ybr`` row per remaining detection
            boxes = np.vstack(
                [
                    [
                        candidate['x'],
                        candidate['y'],
                        candidate['x'] + candidate['w'],
                        candidate['y'] + candidate['h'],
                    ]
                    for candidate in candidates
                ]
            )
            scores = np.array([candidate['c'] for candidate in candidates])

            # NOTE(review): the complement of ``nms_thresh`` is what gets passed on;
            # presumably py_cpu_nms expects an overlap threshold -- confirm
            keeps = py_cpu_nms(boxes, scores, 1.0 - nms_thresh)
            final = ut.take(candidates, keeps)
            final.sort(key=lambda val: val['c'], reverse=True)

    log.debug(f'Found {len(final)} detections')

    return final