Source code for milliontrees.datasets.TreeBoxes

from pathlib import Path
import os
from pathlib import Path

import numpy as np
import pandas as pd
from PIL import Image
import torch
import albumentations as A
import torchvision.transforms as T
import fnmatch

from milliontrees.datasets.milliontrees_dataset import MillionTreesDataset
from milliontrees.common.eval_visualization import save_eval_visualizations
from milliontrees.common.grouper import CombinatorialGrouper
from milliontrees.common.metrics.all_metrics import (
    CountingError,
    DetectionAccuracy,
    DetectionMAP,
    MaskAwareDetectionPrecision,
    MergeCommissionMetric,
)
from milliontrees.common.onboarding import print_dataset_summary
from PIL import Image

from albumentations.pytorch import ToTensorV2


class TreeBoxesDataset(MillionTreesDataset):
    """A dataset of tree annotations with bounding box coordinates from
    multiple global sources.

    The dataset contains aerial imagery of trees with their corresponding
    bounding box annotations. Each tree is annotated with a bounding box
    given by four values (x_min, y_min, x_max, y_max).

    Dataset Splits:
        - random: For each source, a portion of images is in train and a
          portion in test.
        - crossgeometry: Boxes and Points are used to predict polygons.
        - zeroshot: Selected sources are entirely held out for testing.

    Data Format:
        Input (x): RGB aerial imagery
        Labels (y): Nx4 array of bounding box coordinates
        Metadata: Location identifiers for each image

    Args:
        version (str): The version of the dataset to load.
        root_dir (str): The root directory to store the dataset.
        download (bool): Whether to download the dataset if it is not
            already present.
        split_scheme (str): The split scheme to use. One of 'random',
            'crossgeometry' or 'zeroshot'.
        geometry_name (str): The name of the geometry to use; forwarded to
            the evaluation metrics.
        eval_score_threshold (float): The score threshold forwarded to the
            evaluation metrics and to the evaluation visualizations.
        remove_incomplete (bool): Whether to drop rows whose ``complete``
            column is not True.
        image_size (int): Height and width used by the resize transform.
        include_sources (str | list): Source name(s) to keep. Patterns are
            matched case-insensitively with ``fnmatch`` wildcards.
        exclude_sources (str | list): Source name(s) to drop, matched like
            ``include_sources``. When None, sources matching
            ``*unsupervised*`` are dropped unless
            ``include_unsupervised=True``.
        mini (bool): If True, download mini versions of datasets for
            development. Mini datasets are smaller subsets that maintain
            the same structure.
        small (bool): If True, download small releases (up to 50 images per
            source).
        verbose (bool): If True, print a dataset summary after loading.
        include_unsupervised (bool): If True, use the full download (not the
            supervised-only one) and do not exclude unsupervised sources by
            default.

    Raises:
        ValueError: If both ``mini`` and ``small`` are set, if
            ``split_scheme`` is not recognized, or if no annotations remain
            after filtering.

    References:
        Website:
            https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1009180

        Citation:
            @article{Weinstein2020,
              title={A benchmark dataset for canopy crown detection and
                     delineation in co-registered airborne RGB, LiDAR and
                     hyperspectral imagery from the National Ecological
                     Observation Network.},
              author={Weinstein BG, Graves SJ, Marconi S, Singh A, Zare A,
                      Stewart D, et al.},
              journal={PLoS Comput Biol},
              year={2021},
              doi={10.1371/journal.pcbi.1009180}
            }

        License:
            Creative Commons Attribution License
    """

    _dataset_name = 'TreeBoxes'
    _versions_dict = {
        # 0.0 is a placeholder for the testing dataset
        '0.0': {
            'download_url': '',
            'supervised_download_url': '',
            'compressed_size': 105525592
        },
        "0.17": {
            'download_url':
                "https://data.rc.ufl.edu/pub/ewhite/MillionTrees/TreeBoxes_v0.17.zip",
            'supervised_download_url':
                "https://data.rc.ufl.edu/pub/ewhite/MillionTrees/TreeBoxes_supervised_v0.17.zip",
            'compressed_size': 50996758836
        },
        "0.18": {
            'download_url':
                "https://data.rc.ufl.edu/pub/ewhite/MillionTrees/TreeBoxes_v0.18.zip",
            'supervised_download_url':
                "https://data.rc.ufl.edu/pub/ewhite/MillionTrees/TreeBoxes_supervised_v0.18.zip",
            'compressed_size': 67700616443
        }
    }

    def __init__(self,
                 version=None,
                 root_dir='data',
                 download=False,
                 split_scheme='random',
                 geometry_name='y',
                 eval_score_threshold=0.1,
                 remove_incomplete=False,
                 image_size=448,
                 include_sources=None,
                 exclude_sources=None,
                 mini=False,
                 small=False,
                 verbose=True,
                 include_unsupervised=False):
        if mini and small:
            raise ValueError(
                'At most one of mini=True and small=True may be set.')

        self._version = version
        self._split_scheme = split_scheme
        self.geometry_name = geometry_name
        self.eval_score_threshold = eval_score_threshold
        self.image_size = image_size
        self.mini = mini
        self.small = small
        self.verbose = verbose
        self.include_unsupervised = include_unsupervised

        if self._split_scheme not in ['random', 'crossgeometry', 'zeroshot']:
            raise ValueError(
                f'Split scheme {self._split_scheme} not recognized')

        if mini:
            self._versions_dict = self._get_mini_versions_dict()
        elif small:
            self._versions_dict = self._get_small_versions_dict()

        # Select supervised-only dataset by default (smaller download).
        # Users must opt in with include_unsupervised=True to get the full dataset.
        if not include_unsupervised:
            modified_versions = {}
            for v, info in self._versions_dict.items():
                modified_info = dict(info)
                if info.get('supervised_download_url') is not None:
                    modified_info['download_url'] = info[
                        'supervised_download_url']
                modified_versions[v] = modified_info
            self._versions_dict = modified_versions
            # NOTE(review): nesting of this name switch under
            # `not include_unsupervised` was reconstructed from a flattened
            # listing - confirm against the repository.
            if small:
                self._dataset_name = 'SmallTreeBoxes'
            else:
                self._dataset_name = 'TreeBoxes_supervised'

        # path
        self._data_dir = Path(self.initialize_data_dir(root_dir, download))

        # Restore dataset name for proper operation after directory setup
        self._dataset_name = 'TreeBoxes'

        # Load splits (low_memory=False avoids mixed-type DtypeWarning on large CSVs)
        df = pd.read_csv(self._data_dir / f"{self._split_scheme}.csv",
                         low_memory=False)
        coord_cols = ["xmin", "ymin", "xmax", "ymax"]
        for _c in coord_cols:
            df[_c] = pd.to_numeric(df[_c], errors="coerce")
        # Drop boxes with unparsable coordinates or non-positive extent.
        df = df.dropna(subset=coord_cols)
        df = df[(df["xmax"] > df["xmin"]) &
                (df["ymax"] > df["ymin"])].reset_index(drop=True)

        # Cache available sources for convenience
        self.sources = df['source'].unique()
        available_source_count = len(self.sources)

        # Remove incomplete data based on flag
        if remove_incomplete:
            df = df[df['complete'] == True]  # noqa: E712

        # Filter by include/exclude source names with wildcard support
        # Default: exclude sources containing 'unsupervised' unless include_unsupervised=True
        include_patterns = None
        if include_sources is not None and include_sources != []:
            include_patterns = include_sources if isinstance(
                include_sources, (list, tuple)) else [include_sources]

        exclude_patterns = exclude_sources
        if exclude_patterns is None:
            exclude_patterns = [] if include_unsupervised else [
                '*unsupervised*'
            ]
        elif not isinstance(exclude_patterns, (list, tuple)):
            exclude_patterns = [exclude_patterns]

        # Build one keep-mask aligned with df's index so the boolean
        # selection never has to be reindexed by pandas.
        source_str = df['source'].astype(str).str.lower()
        keep = pd.Series(True, index=df.index)
        if include_patterns is not None:
            keep &= self._source_mask(source_str, include_patterns)
        if len(exclude_patterns) > 0:
            keep &= ~self._source_mask(source_str, exclude_patterns)
        df = df[keep]

        selected_source_count = df['source'].nunique()
        df = df.reset_index(drop=True)

        if df.empty:
            # Fail early with context instead of an obscure max()-of-empty
            # error further down.
            raise ValueError(
                f"No annotations remain for split scheme "
                f"'{self._split_scheme}' after filtering "
                f"(remove_incomplete={remove_incomplete}, "
                f"include_sources={include_sources}, "
                f"exclude_sources={exclude_patterns}).")

        # Splits
        self._split_dict = {
            'train': 0,
            'validation': 1,
            'test': 2,
        }
        self._split_names = {
            'train': 'Train',
            'validation': 'Validation',
            'test': 'Test (OOD/Trans)',
        }

        unique_files = df.drop_duplicates(
            subset=['filename'], inplace=False).reset_index(drop=True)
        unique_files['split_id'] = unique_files['split'].apply(
            lambda x: self._split_dict[x])
        self._split_array = unique_files['split_id'].values

        # Filenames
        self._input_array = unique_files.filename

        # Create lookup table for which index to select for each filename.
        # df has a fresh RangeIndex here, so the positional indices returned
        # by GroupBy.indices are identical to the index labels.
        self._input_lookup = dict(df.groupby('filename').indices)

        self._y_array = df[["xmin", "ymin", "xmax",
                            "ymax"]].values.astype("float32")

        # Labels -> just 'Tree'
        self._n_classes = 1

        # Length of targets
        self._y_size = 4

        # Class labels
        self.labels = torch.zeros(df.shape[0])

        # Create source locations with a numeric ID
        df["source_id"] = df.source.astype('category').cat.codes

        # Create filename numeric ID
        df["filename_id"] = df.filename.astype('category').cat.codes

        # Create dictionary for codes to names
        self._source_id_to_code = df.set_index('source_id')['source'].to_dict()
        self._filename_id_to_code = df.set_index(
            'filename_id')['filename'].to_dict()

        # Expose source names to the grouper so per-source eval lines print the
        # source name instead of the numeric source_id. Ordered by source_id
        # (contiguous 0..n-1 from pandas category codes).
        self._metadata_map = {
            'source_id': [
                self._source_id_to_code[i]
                for i in sorted(self._source_id_to_code)
            ]
        }

        # Location/group info. Convert to a Python int so the +1 is not done
        # in the narrow integer dtype of the category codes.
        n_groups = int(df['source_id'].max()) + 1
        self._n_groups = n_groups
        assert len(np.unique(df['source_id'])) == self._n_groups

        # Metadata is at the image level
        unique_sources = df[['filename_id', 'source_id']].drop_duplicates(
            subset="filename_id", inplace=False).reset_index(drop=True)
        self._metadata_array = torch.tensor(unique_sources.values.astype('int'))
        self._metadata_fields = ['filename_id', 'source_id']

        # Map source_id -> complete (used by CountingError to gate which images
        # contribute to MAE). Sources flagged complete=True in
        # source_completeness.csv are exhaustively annotated; others get NaN.
        if 'complete' in df.columns:
            source_complete = df.groupby('source_id')['complete'].first()
            self._source_id_complete = {
                int(k): bool(v) for k, v in source_complete.items()
            }
        else:
            self._source_id_complete = {}

        self._collate = TreeBoxesDataset._collate_fn

        self.metrics = {
            "accuracy":
                DetectionAccuracy(geometry_name=self.geometry_name,
                                  score_threshold=self.eval_score_threshold,
                                  metric="accuracy"),
            "recall":
                DetectionAccuracy(geometry_name=self.geometry_name,
                                  score_threshold=self.eval_score_threshold,
                                  metric="recall"),
            "maskaware_precision":
                MaskAwareDetectionPrecision(
                    geometry_name=self.geometry_name,
                    score_threshold=self.eval_score_threshold),
            "AP50":
                DetectionMAP(geometry_name=self.geometry_name,
                             score_threshold=self.eval_score_threshold,
                             iou_type="bbox",
                             iou_thresholds=[0.5]),
            "merge_commission":
                MergeCommissionMetric(
                    geometry_name=self.geometry_name,
                    score_threshold=self.eval_score_threshold,
                    modality="bbox",
                ),
            "counting_mae":
                CountingError(
                    score_threshold=self.eval_score_threshold,
                    geometry_name=self.geometry_name,
                ),
        }

        # eval grouper
        self._eval_grouper = CombinatorialGrouper(dataset=self,
                                                  groupby_fields=(['source_id'
                                                                  ]))

        if self.verbose:
            n_train_images = int(
                (self._split_array == self._split_dict['train']).sum())
            n_test_images = int(
                (self._split_array == self._split_dict['test']).sum())
            print_dataset_summary(
                dataset_name=self._dataset_name,
                version=self.version,
                data_dir=self._data_dir,
                split_scheme=self._split_scheme,
                n_annotations=len(df),
                n_total_images=len(unique_files),
                n_train_images=n_train_images,
                n_test_images=n_test_images,
                n_available_sources=available_source_count,
                n_selected_sources=selected_source_count,
                mini=self.mini,
                small=self.small,
                include_patterns=include_patterns,
                exclude_patterns=exclude_patterns,
            )

        super().__init__(root_dir, download, self._split_scheme)

    @staticmethod
    def _source_mask(source_str, patterns):
        """Return a boolean mask of rows whose source matches any pattern.

        Args:
            source_str (pd.Series): Lower-cased source names, one per row.
            patterns (list): ``fnmatch``-style patterns; lower-cased here so
                matching is case-insensitive.

        Returns:
            pd.Series: Boolean Series sharing ``source_str``'s index.
        """
        patterns_lower = [p.lower() for p in patterns]
        # Match each distinct source once instead of once per annotation row.
        matched = [
            s for s in source_str.unique()
            if any(fnmatch.fnmatch(s, p) for p in patterns_lower)
        ]
        return source_str.isin(matched)

    def eval(self,
             y_pred,
             y_true,
             metadata,
             *,
             viz_dir=None,
             viz_n_per_source=10):
        """Performs evaluation on the given predictions.

        The main evaluation metric, detection_acc_avg_dom, measures the
        simple average of the detection accuracies of each domain.

        If ``viz_dir`` is set, writes overlay PNGs (purple = ground truth,
        orange = predictions above the eval score threshold), up to
        ``viz_n_per_source`` images per source, in subfolders named by source.

        Args:
            y_pred: Predictions, forwarded unchanged to each metric.
            y_true: Ground truth, forwarded unchanged to each metric.
            metadata: Metadata, forwarded unchanged to each metric.
            viz_dir: Optional output directory for visualizations.
            viz_n_per_source (int): Maximum images visualized per source.

        Returns:
            tuple: ``(results, results_str)`` where ``results`` maps each
            metric name to its result, plus ``'detection_acc_avg_dom'`` and,
            when ``viz_dir`` is given, ``'eval_visualization_paths'``.
        """
        results = {}
        results_str = ''

        for name, metric in self.metrics.items():
            result, result_str = self.standard_group_eval(
                metric, self._eval_grouper, y_pred, y_true, metadata)
            results[name] = result
            results_str += result_str

        accuracy = results["accuracy"]
        prefix = 'detection_acc_source:'
        detection_accs = []
        for k, v in accuracy.items():
            if k.startswith(prefix):
                # Strip the prefix rather than splitting on ':' so source
                # names that themselves contain ':' are kept intact.
                d = k[len(prefix):]
                count = accuracy[f'source:{d}']
                if count > 0:
                    detection_accs.append(v)

        if detection_accs:
            detection_acc_avg_dom = np.array(detection_accs).mean()
        else:
            # No source contributed any sample: report NaN explicitly instead
            # of triggering NumPy's "mean of empty slice" warning.
            detection_acc_avg_dom = np.nan
        results['detection_acc_avg_dom'] = detection_acc_avg_dom
        results_str = f'Average detection_acc across source: {detection_acc_avg_dom:.3f}\n' + results_str

        # Format results with tables
        from milliontrees.common.utils import format_eval_results
        formatted_results = format_eval_results(results, self)
        results_str = formatted_results + '\n' + results_str

        if viz_dir is not None:
            paths = save_eval_visualizations(
                self,
                y_pred,
                y_true,
                metadata,
                viz_dir,
                n_per_source=viz_n_per_source,
                score_threshold=self.eval_score_threshold,
            )
            results["eval_visualization_paths"] = [str(p) for p in paths]

        return results, results_str

    def _get_mini_versions_dict(self):
        """Return the versions dict derived for the "Mini" release."""
        from milliontrees.common.release_sizes import subset_versions_dict
        return subset_versions_dict(self._versions_dict, "TreeBoxes", "Mini")

    def _get_small_versions_dict(self):
        """Return the versions dict derived for the "Small" release."""
        from milliontrees.common.release_sizes import subset_versions_dict
        return subset_versions_dict(self._versions_dict, "TreeBoxes", "Small")

    def get_input(self, idx):
        """Retrieves the input features (image) for a given data point.

        Args:
            idx (int): Index of a data point

        Returns:
            np.ndarray: The idx-th image converted to RGB, as a float32
            array scaled to [0, 1] by dividing by 255.
        """
        # All images are in the images folder
        img_path = self._data_dir / 'images' / self._input_array[idx]
        # The context manager closes the file handle as soon as the pixels
        # have been copied into the array.
        with Image.open(img_path) as img:
            pixels = np.array(img.convert('RGB'))
        return np.array(pixels / 255, dtype=np.float32)

    @staticmethod
    def _collate_fn(batch):
        """Collates a batch by stacking `x` (features) and `metadata`, but
        not `y` (targets).

        The batch is initially a sequence of individual data points:
        (item1, item2, item3, ...). After zipping, it transforms into a list
        of tuples: [(item1[0], item2[0], ...), (item1[1], item2[1], ...), ...].

        Args:
            batch (list): A batch of data points, where each data point is a
                tuple (metadata, x, y).

        Returns:
            tuple: ``(metadata, x, y)`` where ``metadata`` and ``x`` are
            stacked tensors and ``y`` is a plain list with one target entry
            per data point.
        """
        batch = list(zip(*batch))
        batch[1] = torch.stack(batch[1])
        batch[0] = torch.stack(batch[0])
        batch[2] = list(batch[2])
        return tuple(batch)

    def _transform_(self):
        """Build the albumentations pipeline: resize to a square of side
        ``image_size`` and convert to a tensor, with pascal_voc boxes."""
        transform = A.Compose([
            A.Resize(height=self.image_size, width=self.image_size, p=1.0),
            ToTensorV2()
        ],
                              bbox_params=A.BboxParams(format='pascal_voc',
                                                       label_fields=['labels'],
                                                       clip=True))
        return transform