"""
References:
https://medium.com/the-downlinq/the-spacenet-7-multi-temporal-urban-development-challenge-algorithmic-baseline-4515ec9bd9fe
https://arxiv.org/pdf/2102.11958.pdf
https://spacenet.ai/sn7-challenge/
"""
from os.path import dirname
from os.path import exists
from os.path import join
import ubelt as ub
import os
import tarfile
import zipfile
from os.path import relpath
class Archive(object):
    """
    Abstraction over zipfile and tarfile.

    The backend (``zipfile`` or ``tarfile``) is chosen from the file
    extension: ``.zip`` or ``.tar.gz``.

    SeeAlso:
        https://github.com/RKrahl/archive-tools
        https://pypi.org/project/arlib/

    Example:
        >>> from kwcoco.data.grab_spacenet import *  # NOQA
        >>> from os.path import join
        >>> dpath = ub.ensure_app_cache_dir('ubelt', 'tests', 'archive')
        >>> ub.delete(dpath)
        >>> dpath = ub.ensuredir(dpath)
        >>> import pathlib
        >>> dpath = pathlib.Path(dpath)
        >>> #
        >>> #
        >>> mode = 'w'
        >>> self1 = Archive(str(dpath / 'demo.zip'), mode=mode)
        >>> self2 = Archive(str(dpath / 'demo.tar.gz'), mode=mode)
        >>> #
        >>> open(dpath / 'data_1only.txt', 'w').write('bazbzzz')
        >>> open(dpath / 'data_2only.txt', 'w').write('buzzz')
        >>> open(dpath / 'data_both.txt', 'w').write('foobar')
        >>> #
        >>> self1.add(dpath / 'data_both.txt')
        >>> self1.add(dpath / 'data_1only.txt')
        >>> #
        >>> self2.add(dpath / 'data_both.txt')
        >>> self2.add(dpath / 'data_2only.txt')
        >>> #
        >>> self1.close()
        >>> self2.close()
        >>> #
        >>> self1 = Archive(str(dpath / 'demo.zip'), mode='r')
        >>> self2 = Archive(str(dpath / 'demo.tar.gz'), mode='r')
        >>> #
        >>> extract_dpath = ub.ensuredir(str(dpath / 'extracted'))
        >>> extracted1 = self1.extractall(extract_dpath)
        >>> extracted2 = self2.extractall(extract_dpath)
        >>> for fpath in extracted2:
        >>>     print(open(fpath, 'r').read())
        >>> for fpath in extracted1:
        >>>     print(open(fpath, 'r').read())
    """

    def __init__(self, fpath, mode='r'):
        """
        Args:
            fpath (str): path to the archive; must end in ``.zip`` or
                ``.tar.gz``, which selects the backend.
            mode (str): 'r' to read an existing archive, 'w' to create one.

        Raises:
            NotImplementedError: if the extension is not recognized
            Exception: if an existing file fails the backend's sanity check
        """
        self.fpath = fpath
        self.mode = mode
        self.file = None
        self.backend = None
        exist_flag = os.path.exists(fpath)
        if fpath.endswith('.tar.gz'):
            self.backend = tarfile
        elif fpath.endswith('.zip'):
            self.backend = zipfile
        else:
            # BUGFIX: the previous message ('no-exist case') was misleading;
            # this branch means the extension is unsupported.
            raise NotImplementedError(
                'Unsupported archive extension for fpath={!r}'.format(fpath))
        if self.backend is zipfile:
            if exist_flag and not zipfile.is_zipfile(fpath):
                raise Exception('corrupted zip?')
            self.file = zipfile.ZipFile(fpath, mode=mode)
        elif self.backend is tarfile:
            if exist_flag and not tarfile.is_tarfile(fpath):
                raise Exception('corrupted tar.gz?')
            self.file = tarfile.open(fpath, mode + ':gz')
        else:
            raise NotImplementedError

    def __iter__(self):
        """Yields the archived name of each member (str)."""
        if self.backend is tarfile:
            return (mem.name for mem in self.file)
        elif self.backend is zipfile:
            # zip exposes members as a list of names
            return iter(self.file.namelist())

    def add(self, fpath, arcname=None):
        """
        Add a file to the archive.

        Args:
            fpath (str | PathLike): path of the file on disk
            arcname (str | None): name to store it under inside the archive;
                defaults to the path relative to the archive's directory.
        """
        if arcname is None:
            arcname = relpath(fpath, dirname(self.fpath))
        if self.backend is tarfile:
            self.file.add(fpath, arcname)
        if self.backend is zipfile:
            self.file.write(fpath, arcname)

    def extractall(self, output_dpath='.', overwrite=True):
        """
        Extract all members into ``output_dpath``.

        BUGFIX: this method was referenced by the class doctest but was
        missing from the class.

        Args:
            output_dpath (str): directory to extract into
            overwrite (bool): if False, skip members whose target exists

        Returns:
            List[str]: paths each member was (or would be) extracted to
        """
        unarchived_paths = []
        for member in self:
            fpath = join(output_dpath, member)
            unarchived_paths.append(fpath)
            if not overwrite and exists(fpath):
                continue
            parent = dirname(fpath)
            if parent:
                os.makedirs(parent, exist_ok=True)
            # NOTE(review): member names are not sanitized against path
            # traversal; assumes the archive is trusted.
            self.file.extract(member, path=output_dpath)
        return unarchived_paths

    def close(self):
        """Close the underlying file handle."""
        return self.file.close()

    def __enter__(self):
        # BUGFIX: previously referenced the nonexistent ``self.__file__``
        self.file.__enter__()
        return self

    def __exit__(self, *args):
        # BUGFIX: previously referenced the nonexistent ``self.__file__``
        self.file.__exit__(*args)
def unarchive_file(archive_fpath, output_dpath='.', verbose=1, overwrite=True):
    """
    Extract all members of a ``.tar.gz`` or ``.zip`` archive to a directory.

    Args:
        archive_fpath (str): path to the archive file
        output_dpath (str): directory to extract into
        verbose (int): verbosity level for progress reporting
        overwrite (bool): if False, skip members whose target already exists

    Returns:
        List[str]: path each member was (or would be) extracted to

    Raises:
        NotImplementedError: if the file is neither a tarfile nor a zipfile
    """
    if verbose:
        print('Unarchive archive_fpath = {!r} in {}'.format(
            archive_fpath, output_dpath))
    archive_file = None
    try:
        if tarfile.is_tarfile(archive_fpath):
            archive_file = tarfile.open(archive_fpath, 'r:gz')
            archive_namelist = [
                mem.path for mem in ub.ProgIter(
                    iter(archive_file), desc='enumerate members')
            ]
        elif zipfile.is_zipfile(archive_fpath):
            # BUGFIX: the ZipFile used to be bound to a separate local
            # (``zip_file``), leaving ``archive_file`` as None so extraction
            # crashed with AttributeError and the handle was never closed.
            archive_file = zipfile.ZipFile(archive_fpath)
            if verbose:
                print('Enumerate members')
            archive_namelist = archive_file.namelist()
        else:
            raise NotImplementedError(
                'Unsupported archive type: {!r}'.format(archive_fpath))
        unarchived_paths = []
        for member in ub.ProgIter(archive_namelist, desc='extracting',
                                  verbose=verbose):
            fpath = join(output_dpath, member)
            unarchived_paths.append(fpath)
            if not overwrite and exists(fpath):
                continue
            ub.ensuredir(dirname(fpath))
            # NOTE(review): member names are not sanitized against path
            # traversal; assumes the archive is trusted.
            archive_file.extract(member, path=output_dpath)
    finally:
        # Ensure the handle is released even if extraction fails partway
        if archive_file is not None:
            archive_file.close()
    return unarchived_paths
def grab_spacenet7(data_dpath):
    """
    Download (via the AWS CLI) and convert the SpaceNet7 dataset to kwcoco.

    Args:
        data_dpath (str | PathLike): root directory; a ``spacenet``
            subdirectory is created under it for archives, extracted data,
            and the kwcoco json.

    Returns:
        List: a single-element list containing the loaded kwcoco dataset

    References:
        https://spacenet.ai/sn7-challenge/

    Requires:
        awscli

    Ignore:
        mkdir -p $HOME/.cache/kwcoco/data/spacenet/archives
        cd $HOME/.cache/kwcoco/data/spacenet/archives
        # Requires an AWS account
        export AWS_PROFILE=joncrall_at_kitware
        aws s3 cp s3://spacenet-dataset/spacenet/SN7_buildings/tarballs/SN7_buildings_train.tar.gz .
        aws s3 cp s3://spacenet-dataset/spacenet/SN7_buildings/tarballs/SN7_buildings_train_csvs.tar.gz .
        aws s3 cp s3://spacenet-dataset/spacenet/SN7_buildings/tarballs/SN7_buildings_test_public.tar.gz .
    """
    import ubelt as ub
    import pathlib
    import kwcoco
    dpath = ub.ensuredir((data_dpath, 'spacenet'))
    coco_fpath = os.path.join(dpath, 'spacenet7.kwcoco.json')
    archive_dpath = pathlib.Path(ub.ensuredir((dpath, 'archives')))
    extract_dpath = pathlib.Path(ub.ensuredir((dpath, 'extracted')))
    # The stamp caches the whole download/extract/convert pipeline; bump the
    # 'depends' version string to force a redo.
    stamp = ub.CacheStamp('convert_spacenet', dpath=dpath, depends=['v001'])
    if stamp.expired():
        # Each item is an S3 tarball. The 'sha512' values are truncated hash
        # prefixes, checked with ``startswith`` after download.
        items = [
            {
                'uri': 's3://spacenet-dataset/spacenet/SN7_buildings/tarballs/SN7_buildings_train.tar.gz',
                'sha512': '5f810682825859951e55f6a3bf8e96eb6eb85864a90d75349',
            },
            {
                'uri': 's3://spacenet-dataset/spacenet/SN7_buildings/tarballs/SN7_buildings_train_csvs.tar.gz',
                'sha512': 'e4314ac129dd76e7984556c243b7b5c0c238085110ed7f7f619cb0',
            },
            {
                'uri': 's3://spacenet-dataset/spacenet/SN7_buildings/tarballs/SN7_buildings_test_public.tar.gz',
                'sha512': '0677a20f972cc463828bbff8d2fae08e17fdade3cf17ce213dc978',
            },
        ]
        # Heuristic: if all three extracted directories exist, skip both the
        # download and the unarchive steps.
        has_extracted = all([
            d.exists() for d in [extract_dpath / 'csvs',
                                 extract_dpath / 'test_public',
                                 extract_dpath / 'train']])
        for item in items:
            fname = pathlib.Path(item['uri']).name
            item['fpath'] = archive_dpath / fname
        need_download_archive = not has_extracted
        if need_download_archive:
            # Downloading requires the aws executable (and typically AWS
            # credentials configured in the environment).
            aws_exe = ub.find_exe('aws')
            if not aws_exe:
                raise Exception('requires aws exe')
            for item in items:
                if not item['fpath'].exists():
                    command = '{aws_exe} s3 cp {uri} {archive_dpath}'.format(
                        aws_exe=aws_exe, uri=item['uri'], archive_dpath=archive_dpath)
                    info = ub.cmd(command, verbose=3)
                    assert info['ret'] == 0
                    # Verify the download against the known hash prefix
                    got_hash = ub.hash_file(item['fpath'], hasher='sha512')
                    assert got_hash.startswith(item['sha512'])
        need_unarchive = not has_extracted
        if need_unarchive:
            for item in ub.ProgIter(items, desc='extract spacenet', verbose=3):
                archive_fpath = item['fpath']
                unarchive_file(archive_fpath, extract_dpath, overwrite=0, verbose=2)
        coco_dset = convert_spacenet_to_kwcoco(extract_dpath, coco_fpath)
        stamp.renew()
    # Reload from disk so the cached and freshly-converted code paths return
    # an equivalent dataset object.
    coco_dset = kwcoco.CocoDataset(coco_fpath)
    dsets = [coco_dset]
    return dsets
def convert_spacenet_to_kwcoco(extract_dpath, coco_fpath):
    """
    Converts the raw SpaceNet7 dataset to kwcoco

    Args:
        extract_dpath (pathlib.Path): directory containing the extracted
            SpaceNet7 data (must contain a ``train`` subdirectory).
        coco_fpath (str | PathLike): output path for the kwcoco json.

    Returns:
        The constructed kwcoco dataset (also written to ``coco_fpath``).

    Note:
        * The "train" directory contains 60 "videos" representing a region over time.
        * Each "video" directory contains :
            * images - unmasked images
            * images_masked - images with masks applied
            * labels - geojson polys in wgs84?
            * labels_match - geojson polys in wgs84 with track ids?
            * labels_match_pix - geojson polys in pixels with track ids?
            * UDM_masks - unusable data masks (binary data corresponding with an image, may not exist)

    File names appear like:
        "global_monthly_2018_01_mosaic_L15-1538E-1163N_6154_3539_13"

    Ignore:
        dpath = pathlib.Path("/home/joncrall/data/dvc-repos/smart_watch_dvc/extern/spacenet/")
        extract_dpath = dpath / 'extracted'
        coco_fpath = dpath / 'spacenet7.kwcoco.json'
    """
    import kwcoco
    import json
    import kwimage
    import parse
    import datetime
    print('Convert Spacenet7 to kwcoco')
    coco_dset = kwcoco.CocoDataset()
    coco_dset.fpath = coco_fpath
    building_cid = coco_dset.ensure_category('building')
    ignore_cid = coco_dset.ensure_category('ignore')
    # Parser for the year/month embedded in SpaceNet7 mosaic file names
    s7_fname_fmt = parse.Parser('global_monthly_{year:d}_{month:d}_mosaic_{}')
    # Add images: each tile directory becomes a "video", each mosaic a frame
    tile_dpaths = list(extract_dpath.glob('train/*'))
    for tile_dpath in ub.ProgIter(tile_dpaths, desc='add video'):
        tile_name = tile_dpath.name
        vidid = coco_dset.add_video(name=tile_name)
        image_gpaths = sorted(tile_dpath.glob('images/*'))
        # sorted(tile_dpath.glob('labels/*'))
        # sorted(tile_dpath.glob('images_masked/*'))
        # sorted(tile_dpath.glob('labels_match/*'))
        # udm_fpaths = sorted(tile_dpath.glob('UDM_masks/*'))
        for frame_index, gpath in enumerate(image_gpaths):
            gname = str(gpath.stem)
            nameinfo = s7_fname_fmt.parse(gname)
            # Mosaics are monthly; day is not encoded, so pin it to the 1st
            timestamp = datetime.datetime(
                year=nameinfo['year'], month=nameinfo['month'], day=1)
            gid = coco_dset.add_image(
                file_name=str(gpath.relative_to(coco_dset.bundle_dpath)),
                name=gname,
                video_id=vidid,
                frame_index=frame_index,
                date_captured=timestamp.isoformat(),
                channels='r|g|b',
            )
    coco_dset._ensure_imgsize()

    # Add annotations from the pixel-space geojson labels
    def _from_geojson2(geometry):
        # Faster specialized variant of kwimage.Polygon.from_geojson:
        # keeps only the first two coordinate columns (x, y).
        import numpy as np
        coords = geometry['coordinates']
        exterior = np.array(coords[0])[:, 0:2]
        interiors = [np.array(h)[:, 0:2] for h in coords[1:]]
        poly_data = dict(exterior=kwimage.Coords(exterior),
                         interiors=[kwimage.Coords(hole)
                                    for hole in interiors])
        self = kwimage.Polygon(data=poly_data)
        return self
    all_label_fpaths = sorted(extract_dpath.glob('train/*/labels_match_pix/*'))
    for label_fpath in ub.ProgIter(all_label_fpaths, desc='add annots'):
        # Remove trailing suffix ("..._Buildings" -> image name)
        name_parts = label_fpath.stem.split('_')
        assert name_parts[-1] == 'Buildings'
        name = '_'.join(name_parts[:-1])
        with open(label_fpath, 'r') as file:
            label_data = json.load(file)
        assert label_data['type'] == 'FeatureCollection'
        for feat in label_data['features']:
            prop = feat['properties']
            # NOTE(review): invariant within this loop; could be hoisted
            gid = coco_dset.index.name_to_img[name]['id']
            # from_geojson is slow!
            # poly = kwimage.Polygon.from_geojson(feat['geometry'])
            poly = _from_geojson2(feat['geometry'])
            # This is a bottleneck
            boxes = poly.bounding_box()
            boxes = boxes.quantize()
            xywh = boxes.to_xywh().data[0].tolist()
            ann = {
                'bbox': xywh,
                'image_id': gid,
                'category_id': building_cid,
                'track_id': prop['Id'],
                'area': prop['area'],
                'segmentation': poly.to_coco(style='new')
            }
            coco_dset.add_annotation(**ann)
    # Add "ignore" regions from the unusable-data masks (one per image)
    all_udm_fpaths = sorted(extract_dpath.glob('train/*/UDM_masks/*'))
    for udm_fpath in ub.ProgIter(all_udm_fpaths, desc='add ignore masks'):
        name_parts = udm_fpath.stem.split('_')
        assert name_parts[-1] == 'UDM'
        name = '_'.join(name_parts[:-1])
        gid = coco_dset.index.name_to_img[name]['id']
        # Binarize the mask (255 -> 1) and convert to a multipolygon
        c_mask = kwimage.imread(str(udm_fpath))
        c_mask[c_mask == 255] = 1
        mask = kwimage.Mask(c_mask, 'c_mask')
        poly = mask.to_multi_polygon()
        xywh = ub.peek(poly.bounding_box().quantize().to_coco())
        ann = {
            'bbox': xywh,
            'image_id': gid,
            'category_id': ignore_cid,
            'segmentation': poly.to_coco(style='new')
        }
        coco_dset.add_annotation(**ann)
    print('coco_dset.fpath = {!r}'.format(coco_dset.fpath))
    print('coco_dset = {!r}'.format(coco_dset))
    coco_dset.dump(str(coco_dset.fpath))
    # We will generally want an SQL cache when working with this dataset
    if ub.argflag('--sql-hack'):
        from kwcoco.coco_sql_dataset import CocoSqlDatabase
        CocoSqlDatabase.coerce(coco_dset)
    return coco_dset
def main():
    """Entry point: fetch and convert SpaceNet7 into the kwcoco app cache."""
    cache_dpath = ub.ensure_app_cache_dir('kwcoco', 'data')
    grab_spacenet7(cache_dpath)
if __name__ == '__main__':
    """
    CommandLine:
        python ~/code/kwcoco/kwcoco/data/grab_spacenet.py
    """
    # Script entry point: downloads and converts SpaceNet7
    main()