Source code for kwcoco.data.grab_spacenet

"""
References:
    https://medium.com/the-downlinq/the-spacenet-7-multi-temporal-urban-development-challenge-algorithmic-baseline-4515ec9bd9fe
    https://arxiv.org/pdf/2102.11958.pdf
    https://spacenet.ai/sn7-challenge/
"""
import ubelt as ub
import os
from kwcoco.util import util_archive


def grab_spacenet7(data_dpath):
    """
    References:
        https://spacenet.ai/sn7-challenge/

    Requires:
        awscli

    Ignore:
        mkdir -p $HOME/.cache/kwcoco/data/spacenet/archives
        cd $HOME/.cache/kwcoco/data/spacenet/archives

        # Requires an AWS account
        export AWS_PROFILE=joncrall_at_kitware

        aws s3 cp s3://spacenet-dataset/spacenet/SN7_buildings/tarballs/SN7_buildings_train.tar.gz .
        aws s3 cp s3://spacenet-dataset/spacenet/SN7_buildings/tarballs/SN7_buildings_train_csvs.tar.gz .
        aws s3 cp s3://spacenet-dataset/spacenet/SN7_buildings/tarballs/SN7_buildings_test_public.tar.gz .
    """
    import pathlib
    import kwcoco
    dpath = ub.ensuredir((data_dpath, 'spacenet'))
    coco_fpath = os.path.join(dpath, 'spacenet7.kwcoco.json')

    archive_dpath = pathlib.Path(ub.ensuredir((dpath, 'archives')))
    extract_dpath = pathlib.Path(ub.ensuredir((dpath, 'extracted')))

    stamp = ub.CacheStamp('convert_spacenet', dpath=dpath, depends=['v001'])
    if stamp.expired():
        items = [
            {
                'uri': 's3://spacenet-dataset/spacenet/SN7_buildings/tarballs/SN7_buildings_train.tar.gz',
                'sha512': '5f810682825859951e55f6a3bf8e96eb6eb85864a90d75349',
            },
            {
                'uri': 's3://spacenet-dataset/spacenet/SN7_buildings/tarballs/SN7_buildings_train_csvs.tar.gz',
                'sha512': 'e4314ac129dd76e7984556c243b7b5c0c238085110ed7f7f619cb0',
            },
            {
                'uri': 's3://spacenet-dataset/spacenet/SN7_buildings/tarballs/SN7_buildings_test_public.tar.gz',
                'sha512': '0677a20f972cc463828bbff8d2fae08e17fdade3cf17ce213dc978',
            },
        ]

        has_extracted = all([d.exists() for d in [
            extract_dpath / 'csvs',
            extract_dpath / 'test_public',
            extract_dpath / 'train',
        ]])

        for item in items:
            fname = pathlib.Path(item['uri']).name
            item['fpath'] = archive_dpath / fname

        need_download_archive = not has_extracted
        if need_download_archive:
            aws_exe = ub.find_exe('aws')
            if not aws_exe:
                raise Exception('requires aws exe')

            for item in items:
                if not item['fpath'].exists():
                    command = '{aws_exe} s3 cp {uri} {archive_dpath}'.format(
                        aws_exe=aws_exe, uri=item['uri'],
                        archive_dpath=archive_dpath)
                    info = ub.cmd(command, verbose=3)
                    assert info['ret'] == 0
                    got_hash = ub.hash_file(item['fpath'], hasher='sha512')
                    assert got_hash.startswith(item['sha512'])

        need_unarchive = not has_extracted
        if need_unarchive:
            for item in ub.ProgIter(items, desc='extract spacenet', verbose=3):
                archive_fpath = item['fpath']
                util_archive.unarchive_file(archive_fpath, extract_dpath,
                                            overwrite=0, verbose=2)

        coco_dset = convert_spacenet_to_kwcoco(extract_dpath, coco_fpath)
        stamp.renew()

    coco_dset = kwcoco.CocoDataset(coco_fpath)
    dsets = [coco_dset]
    return dsets
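
# The function above hinges on the ``ub.CacheStamp`` idiom: the expensive
# download / extract / convert pipeline runs only when the stamp is missing or
# its ``depends`` key changes. The following is a minimal self-contained
# sketch of that idiom; ``_cache_stamp_demo`` and its placeholder payload are
# hypothetical and not part of the original module.
def _cache_stamp_demo():
    dpath = ub.ensure_app_cache_dir('kwcoco', 'demo')
    stamp = ub.CacheStamp('demo_work', dpath=dpath, depends=['v001'])
    if stamp.expired():
        # Expensive work (e.g. download + extraction) would happen here.
        # Renewing the stamp marks the work as done, so reruns skip it.
        stamp.renew()
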
def convert_spacenet_to_kwcoco(extract_dpath, coco_fpath):
    """
    Converts the raw SpaceNet7 dataset to kwcoco

    Note:
        * The "train" directory contains 60 "videos" representing a region
          over time.

        * Each "video" directory contains:

            * images - unmasked images
            * images_masked - images with masks applied
            * labels - geojson polys in wgs84?
            * labels_match - geojson polys in wgs84 with track ids?
            * labels_match_pix - geojson polys in pixels with track ids?
            * UDM_masks - unusable data masks (binary data corresponding with
              an image, may not exist)

        File names appear like:

            "global_monthly_2018_01_mosaic_L15-1538E-1163N_6154_3539_13"

    Ignore:
        dpath = pathlib.Path("/home/joncrall/data/dvc-repos/smart_watch_dvc/extern/spacenet/")
        extract_dpath = dpath / 'extracted'
        coco_fpath = dpath / 'spacenet7.kwcoco.json'
    """
    import kwcoco
    import json
    import kwimage
    import parse
    import datetime

    print('Convert Spacenet7 to kwcoco')
    coco_dset = kwcoco.CocoDataset()
    coco_dset.fpath = coco_fpath

    building_cid = coco_dset.ensure_category('building')
    ignore_cid = coco_dset.ensure_category('ignore')

    s7_fname_fmt = parse.Parser('global_monthly_{year:d}_{month:d}_mosaic_{}')

    # Add images: one video per tile, one frame per monthly mosaic
    tile_dpaths = list(extract_dpath.glob('train/*'))
    for tile_dpath in ub.ProgIter(tile_dpaths, desc='add video'):
        tile_name = tile_dpath.name
        vidid = coco_dset.add_video(name=tile_name)

        image_gpaths = sorted(tile_dpath.glob('images/*'))
        # sorted(tile_dpath.glob('labels/*'))
        # sorted(tile_dpath.glob('images_masked/*'))
        # sorted(tile_dpath.glob('labels_match/*'))
        # udm_fpaths = sorted(tile_dpath.glob('UDM_masks/*'))

        for frame_index, gpath in enumerate(image_gpaths):
            gname = str(gpath.stem)
            nameinfo = s7_fname_fmt.parse(gname)
            timestamp = datetime.datetime(
                year=nameinfo['year'], month=nameinfo['month'], day=1)
            gid = coco_dset.add_image(
                file_name=str(gpath.relative_to(coco_dset.bundle_dpath)),
                name=gname,
                video_id=vidid,
                frame_index=frame_index,
                date_captured=timestamp.isoformat(),
                channels='r|g|b',
            )

    coco_dset._ensure_imgsize()

    # Add annotations
    def _from_geojson2(geometry):
        # Faster specialized variant of kwimage.Polygon.from_geojson
        import numpy as np
        coords = geometry['coordinates']
        exterior = np.array(coords[0])[:, 0:2]
        interiors = [np.array(h)[:, 0:2] for h in coords[1:]]
        poly_data = dict(
            exterior=kwimage.Coords(exterior),
            interiors=[kwimage.Coords(hole) for hole in interiors])
        self = kwimage.Polygon(data=poly_data)
        return self

    all_label_fpaths = sorted(extract_dpath.glob('train/*/labels_match_pix/*'))
    for label_fpath in ub.ProgIter(all_label_fpaths, desc='add annots'):
        # Remove trailing suffix
        name_parts = label_fpath.stem.split('_')
        assert name_parts[-1] == 'Buildings'
        name = '_'.join(name_parts[:-1])
        with open(label_fpath, 'r') as file:
            label_data = json.load(file)

        assert label_data['type'] == 'FeatureCollection'
        for feat in label_data['features']:
            prop = feat['properties']
            gid = coco_dset.index.name_to_img[name]['id']

            # from_geojson is slow!
            # poly = kwimage.Polygon.from_geojson(feat['geometry'])
            poly = _from_geojson2(feat['geometry'])

            # This is a bottleneck
            boxes = poly.bounding_box()
            boxes = boxes.quantize()
            xywh = boxes.to_xywh().data[0].tolist()

            ann = {
                'bbox': xywh,
                'image_id': gid,
                'category_id': building_cid,
                'track_id': prop['Id'],
                'area': prop['area'],
                'segmentation': poly.to_coco(style='new'),
            }
            coco_dset.add_annotation(**ann)

    # Add unusable data masks (UDM) as "ignore" region annotations
    all_udm_fpaths = sorted(extract_dpath.glob('train/*/UDM_masks/*'))
    for udm_fpath in ub.ProgIter(all_udm_fpaths, desc='add ignore masks'):
        name_parts = udm_fpath.stem.split('_')
        assert name_parts[-1] == 'UDM'
        name = '_'.join(name_parts[:-1])
        gid = coco_dset.index.name_to_img[name]['id']

        c_mask = kwimage.imread(str(udm_fpath))
        c_mask[c_mask == 255] = 1
        mask = kwimage.Mask(c_mask, 'c_mask')
        poly = mask.to_multi_polygon()
        xywh = ub.peek(poly.bounding_box().quantize().to_coco())
        ann = {
            'bbox': xywh,
            'image_id': gid,
            'category_id': ignore_cid,
            'segmentation': poly.to_coco(style='new'),
        }
        coco_dset.add_annotation(**ann)

    print('coco_dset.fpath = {!r}'.format(coco_dset.fpath))
    print('coco_dset = {!r}'.format(coco_dset))
    coco_dset.dump(str(coco_dset.fpath))

    # We will generally want an SQL cache when working with this dataset
    if ub.argflag('--sql-hack'):
        from kwcoco.coco_sql_dataset import CocoSqlDatabase
        CocoSqlDatabase.coerce(coco_dset)

    return coco_dset
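
# Illustrative helpers (hypothetical, not part of the original module) that
# exercise two pieces of the conversion above in isolation: parsing the
# capture date out of a SpaceNet7 mosaic filename, and the fast
# geojson-to-kwimage polygon construction used to sidestep the slow
# ``kwimage.Polygon.from_geojson``.
def _parse_fname_demo():
    import parse
    s7_fname_fmt = parse.Parser('global_monthly_{year:d}_{month:d}_mosaic_{}')
    gname = 'global_monthly_2018_01_mosaic_L15-1538E-1163N_6154_3539_13'
    nameinfo = s7_fname_fmt.parse(gname)
    assert nameinfo['year'] == 2018
    assert nameinfo['month'] == 1


def _geojson_poly_demo():
    import kwimage
    import numpy as np
    # A toy geojson Polygon: one exterior ring and one hole
    geometry = {
        'type': 'Polygon',
        'coordinates': [
            [[0, 0], [10, 0], [10, 10], [0, 10], [0, 0]],
            [[2, 2], [4, 2], [4, 4], [2, 4], [2, 2]],
        ],
    }
    coords = geometry['coordinates']
    exterior = np.array(coords[0])[:, 0:2]
    interiors = [np.array(h)[:, 0:2] for h in coords[1:]]
    poly = kwimage.Polygon(data=dict(
        exterior=kwimage.Coords(exterior),
        interiors=[kwimage.Coords(h) for h in interiors]))
    print(poly.to_coco(style='new'))
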
def main():
    data_dpath = ub.ensure_app_cache_dir('kwcoco', 'data')
    grab_spacenet7(data_dpath)
if __name__ == '__main__':
    """
    CommandLine:
        python ~/code/kwcoco/kwcoco/data/grab_spacenet.py
    """
    main()