sgsPy
structurally guided sampling
Loading...
Searching...
No Matches
srs.h
Go to the documentation of this file.
1/******************************************************************************
2 *
3 * Project: sgs
4 * Purpose: C++ implementation of random sampling
5 * Author: Joseph Meyer
6 * Date: June, 2025
7 *
8 ******************************************************************************/
9
14
15#include <iostream>
16#include <random>
17
18#include <boost/unordered/unordered_flat_set.hpp>
19#include <xoshiro.h>
20
21#include "utils/access.h"
22#include "utils/existing.h"
23#include "utils/helper.h"
24#include "utils/raster.h"
25#include "utils/vector.h"
26
27namespace sgs {
28namespace srs {
29
51template <typename T>
52inline bool
55 int width,
56 int height,
57 int numSamples,
60 std::vector<helper::Index>& indices,
61 xso::xoshiro_4x64_plus rng)
62{
63 T nan = static_cast<T>(band.nan);
64
65 uint64_t width64 = static_cast<int64_t>(width);
66 uint64_t height64 = static_cast<int64_t>(height);
67
68 //get mask for rng output using bit twiddling
69 uint64_t maxIndex = width64 * height64 - 1;
70 uint64_t mask = maxIndex;
71 mask |= mask >> 1;
72 mask |= mask >> 2;
73 mask |= mask >> 4;
74 mask |= mask >> 8;
75 mask |= mask >> 16;
76 mask |= mask >> 32;
77
78 int iterations = 0;
79 int maxIterations = numSamples * 11;
80 boost::unordered::unordered_flat_set<uint64_t> indexSet;
81 while (iterations < maxIterations && indices.size() < static_cast<size_t>(numSamples)) {
82 //generate a random valid index
83 //
84 //bit shift by 11 because lower 11 bits are not random enough in this (fast) xoshiro configuration
85 uint64_t index = (rng() >> 11) & mask;
86 while (index > maxIndex) {
87 index = (rng() >> 11) & mask;
88 }
89 int x = static_cast<int>(index % width64);
90 int y = static_cast<int>(index / width64);
91
92 //read the value from that index
93 T val;
94 CPLErr err = band.p_band->RasterIO(GF_Read, x, y, 1, 1, &val, 1, 1, band.type, 0, 0);
95 if (err) {
96 throw std::runtime_error("error reading pixel from raster band using GDALRasterBand::RasterIO().");
97 }
98
99 //check if val is nan
100 bool isNan = std::isnan(val) || val == nan;
101 if (isNan) {
102 iterations++;
103 continue;
104 }
105
106 //check if pixel is accessible
107 bool accessible = !access.used;
108 if (access.used) {
109 int8_t accessVal;
110 CPLErr err = access.band.p_band->RasterIO(GF_Read, x, y, 1, 1, &accessVal, 1, 1, access.band.type, 0, 0);
111 if (err) {
112 throw std::runtime_error("error reading pixel from access raster band using GDALRasterBand::RasterIO().");
113 }
114 accessible = accessVal != 1;
115 }
116 if (!accessible) {
117 iterations++;
118 continue;
119 }
120
121 //check if pixel is already sampled
122 bool alreadySampled = existing.used && existing.containsIndex(x, y);
123 alreadySampled |= indexSet.find(index) != indexSet.end();
124 if (alreadySampled) {
125 iterations++;
126 continue;
127 }
128
129 //add index to indices map if the pixel is not nan, accessible, and not already sampled
130 indexSet.insert(index);
131 indices.push_back(helper::Index(x, y));
132 iterations++;
133 }
134
135 return indices.size() == static_cast<size_t>(numSamples);
136}
137
158template <typename T>
159inline void
164 std::vector<helper::Index>& indices,
166 int xBlock,
167 int yBlock,
168 int xValid,
169 int yValid)
170{
171 T nan = static_cast<T>(band.nan);
172 int8_t *p_access = reinterpret_cast<int8_t *>(access.band.p_buffer);
173
174 for (int y = 0; y < yValid; y++) {
175 size_t blockIndex = static_cast<size_t>(y * band.xBlockSize);
176 for (int x = 0; x < xValid; x++) {
177 //get val
178 T val = helper::getPixelValueDependingOnType<T>(band.type, band.p_buffer, blockIndex);
179
180 //check nan
181 bool isNan = val == nan || std::isnan(val);
182 if (isNan) {
183 blockIndex++;
184 continue;
185 }
186
187 //check access
188 if (access.used && p_access[blockIndex] == 1) {
189 blockIndex++;
190 continue;
191 }
192
193 helper::Index index = {x + xBlock * band.xBlockSize, y + yBlock * band.yBlockSize};
194
195 //check existing
196 if (existing.used && existing.containsIndex(index.x, index.y)) {
197 blockIndex++;
198 continue;
199 }
200
201 //check rand val
202 if (!rand.next()) {
203 blockIndex++;
204 continue;
205 }
206
207 //add index to indices
208 indices.push_back(index);
209
210 //increment within-block index
211 blockIndex++;
212 }
213 }
214}
215
283std::tuple<std::vector<std::vector<double>>, vector::GDALVectorWrapper *, size_t>
286 size_t numSamples,
287 double mindist,
288 vector::GDALVectorWrapper *p_existing,
290 std::string layerName,
291 double buffInner,
292 double buffOuter,
293 bool plot,
294 std::string tempFolder,
295 std::string filename)
296{
297 GDALAllRegister();
298
299 int width = p_raster->getWidth();
300 int height = p_raster->getHeight();
301 double *GT = p_raster->getGeotransform();
302 bool useMindist = mindist != 0;
303 std::mutex mutex;
304
305 std::vector<double> xCoords, yCoords;
306 std::vector<size_t> indexes;
307
308 //step 3: get first raster band
310 band.p_band = p_raster->getRasterBand(0);
311 band.type = p_raster->getRasterBandType(0);
312 band.size = p_raster->getRasterBandTypeSize(0);
313 band.nan = band.p_band->GetNoDataValue();
314 band.p_mutex = &mutex;
315 band.p_band->GetBlockSize(&band.xBlockSize, &band.yBlockSize);
316 band.p_buffer = VSIMalloc3(band.xBlockSize, band.yBlockSize, band.size);
317
318 //create output dataset before doing anything which will take a long time in case of failure.
319 GDALDriver *p_driver = GetGDALDriverManager()->GetDriverByName("MEM");
320 if (!p_driver) {
321 throw std::runtime_error("unable to create output sample dataset driver.");
322 }
323 GDALDataset *p_samples = p_driver->Create("", 0, 0, 0, GDT_Unknown, nullptr);
324 if (!p_samples) {
325 throw std::runtime_error("unable to create output dataset with driver.");
326 }
327
328 vector::GDALVectorWrapper *p_wrapper = new vector::GDALVectorWrapper(p_samples, std::string(p_raster->getDataset()->GetProjectionRef()));
329 OGRLayer *p_layer = p_samples->CreateLayer("samples", p_wrapper->getSRS(), wkbPoint, nullptr);
330 if (!p_layer) {
331 throw std::runtime_error("unable to create output dataset layer.");
332 }
333
334 //generate access structure
336 p_access,
337 p_raster,
338 layerName,
339 buffInner,
340 buffOuter,
341 true,
342 tempFolder,
343 band.xBlockSize,
344 band.yBlockSize
345 );
346
347 if (access.used) {
348 access.band.p_buffer = VSIMalloc3(band.xBlockSize, band.yBlockSize, access.band.size);
349 }
350
351 //generate existing structure
353 p_existing,
354 p_raster,
355 GT,
356 p_raster->getWidth(),
357 p_layer,
358 plot,
359 xCoords,
360 yCoords
361 );
362
363 int xBlocks = (width + band.xBlockSize - 1) / band.xBlockSize;
364 int yBlocks = (height + band.yBlockSize - 1) / band.yBlockSize;
365 std::vector<helper::Index> indices;
366
367 std::vector<bool> randVals(band.xBlockSize * band.yBlockSize);
368 int randValIndex = band.xBlockSize * band.yBlockSize;
369
370 //fast random number generator using xoshiro256++
371 //https://vigna.di.unimi.it/ftp/papers/ScrambledLinear.pdf
372 xso::xoshiro_4x64_plus rng;
373
374 // when reading pixels or blocks into memory using GDAL, the whole block is always read into memory.
375 // This reading of blocks is a large portion of the runtime for the processing of raster images.
376 //
377 // When using the Simple Random Sampling (srs) method, there is no requirement that the pixels
378 // have any relation to each other or represent any kind of distribution in the overall raster.
379 // This means that there is no requirement to read through the entire raster image.
380 //
381 // In some cases, it may be faster to select pixels at random from the image, hoping they are
382 // 1. not nan
383 // 2. acessible
384 // 3. not already sampled
385 //
386 // However, there are cases when more blocks will be read into memory by randomly selecting
387 // pixels compared to reading through the whole raster. This is because random access patterns
388 // have a VERY low hit rate in the cache, leading to significant thrashing -- we have to assume
389 // every time a pixel is read a block is read into the cache whereas when the entire raster is
390 // read sequentially, a new block must be read into the cache ONLY when the first pixel in that
391 // block is looked at.
392 //
393 // In the following, we attemt to estimate which method will result in the smaller number of
394 // blocks read into memory and use that to generate random pixels, assuming that half of the
395 // raster is nan.
396 //
397 // worth noting -- if not enough pixels are able to be determined using the random access method,
398 // the full raster read method is then used.
399
400 //total blocks is yBlocks * xBlocks
401 size_t numBlocks = static_cast<size_t>(xBlocks) * static_cast<size_t>(yBlocks);
402
403 //desired samples is x3 if using mindist
404 size_t desiredSamples = static_cast<size_t>(useMindist ? numSamples * 3 : numSamples);
405
406 //total number of pixels in the raster
407 size_t allPixels = static_cast<size_t>(width) * static_cast<size_t>(height);
408
409 //total area is width * height * pixelWidth * pixelHeight == allPixels * pixelWidth * pixelHeight
410 double totalArea = static_cast<double>(p_raster->getPixelHeight()) *
411 static_cast<double>(p_raster->getPixelWidth()) *
412 static_cast<double>(allPixels);
413
414 //accessible pixels is the proportion of pixels which are within the accessible area times the total pixel count
415 double accessiblePixels = allPixels * (access.used ? (access.area / totalArea) : 1.0);
416
417 //valid pixels divides the accessible pixels by 2 (assuming half nan), then subtracts existing samples
418 size_t validPixels = static_cast<size_t>(accessiblePixels / 2.0) - (existing.used ? existing.samples.size() : 0);
419
420 size_t randomAccessBlocksRequired = 0;
421
422 while (desiredSamples > 0 && randomAccessBlocksRequired < numBlocks) {
423 //the likelihood of guessing a valid pixel is equal to (remainingValidPixles / allPixels)
424 //
425 //therefore it is estimated there will be a total number of 1 / (remainingValidPixels / allPixels)
426 //guesses or random block accesses.
427 //
428 //This is equivalent to allPixels / validPixels.
429 //
430 //Then, since we sampled 1 pixel we reduce the number of valid remaining pixels by 1.
431
432 randomAccessBlocksRequired += (allPixels / validPixels);
433 validPixels--;
434 desiredSamples--;
435 }
436
437 //set max number of random accesses as the estimated required number * 1.25
438 size_t maxRandomAccessBlocks = randomAccessBlocksRequired + randomAccessBlocksRequired / 4;
439
440 //if the max estimated number of blocks read into memory under a random access strategy is less than
441 //reading the whole raster, try the random access strategy capping the number of accesses at this max value.
442 //
443 //Then, only read the entire raster if not enough pixels were read by the random strategy
444 bool haveEnoughSamples = false;
445 if (maxRandomAccessBlocks < numBlocks) {
446 desiredSamples = static_cast<size_t>(useMindist ? numSamples * 3 : numSamples);
447
448 switch (band.type) {
449 case GDT_Int8:
450 haveEnoughSamples = getRandomIndices<int8_t>(band, width, height, desiredSamples, access, existing, indices, rng);
451 break;
452 case GDT_UInt16:
453 haveEnoughSamples = getRandomIndices<uint16_t>(band, width, height, desiredSamples, access, existing, indices, rng);
454 break;
455 case GDT_Int16:
456 haveEnoughSamples = getRandomIndices<int16_t>(band, width, height, desiredSamples, access, existing, indices, rng);
457 break;
458 case GDT_UInt32:
459 haveEnoughSamples = getRandomIndices<uint32_t>(band, width, height, desiredSamples, access, existing, indices, rng);
460 break;
461 case GDT_Int32:
462 haveEnoughSamples = getRandomIndices<int32_t>(band, width, height, desiredSamples, access, existing, indices, rng);
463 break;
464 case GDT_Float32:
465 haveEnoughSamples = getRandomIndices<float>(band, width, height, desiredSamples, access, existing, indices, rng);
466 break;
467 case GDT_Float64:
468 haveEnoughSamples = getRandomIndices<double>(band, width, height, desiredSamples, access, existing, indices, rng);
469 break;
470 default:
471 throw std::runtime_error("raster pixel data type not supported.");
472 }
473 }
474
475 if (!haveEnoughSamples) {
476 //the multiplier which will be multiplied by the 53 most significant bits of the output of the
477 //random number generator to see whether a pixel should be added or not. The multiplier is
478 //a uint64_t number where the least significant n bits are 1 and the remaining are 0. The pixel
479 //is added when the least significant n bits (of the bit shifted 53 bits) within the rng match
480 //those of the multiplier. The probability a pixel is add is then (1/2^n). Using this method,
481 //knowing the amount of pixels, estimating the number of nan pixels, taking into account mindist
482 //and accessible area, we can estimate a percentage chance for each pixel and set up a multiplier
483 //to make that percentage happen. Doing this enables retaining only a small portion of pixel data
484 //and reducing memory footprint significantly, otherwise the index of every pixel
485 //would have to be stored, which would not be feasible for large rasters.
486 uint64_t multiplier = helper::getProbabilityMultiplier(
487 width,
488 height,
489 p_raster->getPixelWidth(),
490 p_raster->getPixelHeight(),
491 8,
492 numSamples,
493 useMindist,
494 access.area
495 );
496
497 helper::RandValController rand(band.xBlockSize, band.yBlockSize, multiplier, &rng);
498
499 for (int yBlock = 0; yBlock < yBlocks; yBlock++) {
500 for (int xBlock = 0; xBlock < xBlocks; xBlock++) {
501 int xValid, yValid;
502
503 //read block
504 band.p_band->GetActualBlockSize(xBlock, yBlock, &xValid, &yValid);
505 helper::rasterBandIO(band, band.p_buffer, band.xBlockSize, band.yBlockSize, xBlock, yBlock, xValid, yValid, true, false);
506 //read = true
507 //read access block if necessary
508 if (access.used) {
509 helper::rasterBandIO(access.band, access.band.p_buffer, band.xBlockSize, band.yBlockSize, xBlock, yBlock, xValid, yValid, true, false);
510 }
511
512 //calculate random values
513 rand.calculateRandValues();
514
515 //process block
516 switch (band.type) {
517 case GDT_Int8:
518 processBlock<int8_t>(band, access, existing, indices, rand, xBlock, yBlock, xValid, yValid);
519 break;
520 case GDT_UInt16:
521 processBlock<uint16_t>(band, access, existing, indices, rand, xBlock, yBlock, xValid, yValid);
522 break;
523 case GDT_Int16:
524 processBlock<int16_t>(band, access, existing, indices, rand, xBlock, yBlock, xValid, yValid);
525 break;
526 case GDT_UInt32:
527 processBlock<uint32_t>(band, access, existing, indices, rand, xBlock, yBlock, xValid, yValid);
528 break;
529 case GDT_Int32:
530 processBlock<int32_t>(band, access, existing, indices, rand, xBlock, yBlock, xValid, yValid);
531 break;
532 case GDT_Float32:
533 processBlock<float>(band, access, existing, indices, rand, xBlock, yBlock, xValid, yValid);
534 break;
535 case GDT_Float64:
536 processBlock<double>(band, access, existing, indices, rand, xBlock, yBlock, xValid, yValid);
537 break;
538 default:
539 throw std::runtime_error("raster pixel data type is not supported.");
540 }
541 }
542 }
543 }
544
545 std::shuffle(indices.begin(), indices.end(), rng);
546
547 size_t samplesAdded = existing.used ? existing.count() : 0;
548 size_t i = 0;
549 helper::NeighborMap neighbor_map;
550 double mindist_sq = mindist * mindist;
551
552 helper::Field fieldExistingFalse("existing", 0);
553 while (samplesAdded < numSamples && i < indices.size()) {
554 helper::Index index = indices[i];
555 bool valid = true;
556 const auto [x, y] = helper::sample_to_point(GT, index);
557
558 if (useMindist) {
559 valid = helper::is_valid_sample(x, y, neighbor_map, mindist, mindist_sq);
560 }
561
562 if (valid) {
563 OGRPoint point = OGRPoint(x, y);
564
565 existing.used ?
566 helper::addPoint(&point, p_layer, &fieldExistingFalse) :
567 helper::addPoint(&point, p_layer);
568
569 samplesAdded++;
570
571 if (plot) {
572 xCoords.push_back(x);
573 yCoords.push_back(y);
574 }
575 }
576 i++;
577 }
578
579
580 if (filename != "") {
581 try {
582 p_wrapper->write(filename);
583 }
584 catch (const std::exception& e) {
585 std::cout << "Exception thrown trying to write file: " << e.what() << std::endl;
586 }
587 }
588
589 //free allocated memory
590 VSIFree(band.p_buffer);
591 if (access.used) {
592 VSIFree(access.band.p_buffer);
593 }
594
595 return {{xCoords, yCoords}, p_wrapper, samplesAdded};
596}
597
598} //namespace srs
599} //namespace sgs
Definition helper.h:976
void calculateRandValues(void)
Definition helper.h:1019
bool next(void)
Definition helper.h:1034
Definition raster.h:57
GDALDataset * getDataset()
Definition raster.h:429
int getWidth()
Definition raster.h:467
size_t getRasterBandTypeSize(int band)
Definition raster.h:735
double * getGeotransform()
Definition raster.h:590
GDALDataType getRasterBandType(int band)
Definition raster.h:724
int getHeight()
Definition raster.h:476
GDALRasterBand * getRasterBand(int band)
Definition raster.h:696
double getPixelWidth()
Definition raster.h:555
double getPixelHeight()
Definition raster.h:565
Definition vector.h:46
OGRSpatialReference * getSRS(void)
Definition vector.h:457
void write(std::string filename)
Definition vector.h:397
boost::unordered::unordered_flat_map< std::pair< int, int >, std::vector< std::pair< double, double > >, PointHash > NeighborMap
Definition helper.h:1061
uint64_t getProbabilityMultiplier(double width, double height, double pixelWidth, double pixelHeight, int startMult, int numSamples, bool useMindist, double accessibleArea)
Definition helper.h:940
void rasterBandIO(RasterBandMetaData &band, void *p_buffer, int xBlockSize, int yBlockSize, int xBlock, int yBlock, int xValid, int yValid, bool read, bool threaded=true)
Definition helper.h:643
std::pair< double, double > sample_to_point(double *GT, Index &index)
Definition helper.h:1108
T getPixelValueDependingOnType(GDALDataType type, void *p_data, size_t index)
Definition helper.h:173
bool is_valid_sample(double x, double y, NeighborMap &neighbor_map, float mindist, float mindist_sq)
Definition helper.h:1072
void addPoint(OGRPoint *p_point, OGRLayer *p_layer)
Definition helper.h:700
bool getRandomIndices(helper::RasterBandMetaData &band, int width, int height, int numSamples, access::Access &access, existing::Existing &existing, std::vector< helper::Index > &indices, xso::xoshiro_4x64_plus rng)
Definition srs.h:53
void processBlock(helper::RasterBandMetaData &band, access::Access &access, existing::Existing &existing, std::vector< helper::Index > &indices, helper::RandValController &rand, int xBlock, int yBlock, int xValid, int yValid)
Definition srs.h:160
std::tuple< std::vector< std::vector< double > >, vector::GDALVectorWrapper *, size_t > srs(raster::GDALRasterWrapper *p_raster, size_t numSamples, double mindist, vector::GDALVectorWrapper *p_existing, vector::GDALVectorWrapper *p_access, std::string layerName, double buffInner, double buffOuter, bool plot, std::string tempFolder, std::string filename)
Definition srs.h:284
Definition access.h:23
Definition existing.h:27
Definition pca.h:23
Definition access.h:30
Definition existing.h:38
Definition helper.h:46
Definition helper.h:37
int x
Definition helper.h:38
int y
Definition helper.h:39
Definition helper.h:87
int xBlockSize
Definition helper.h:94
std::mutex * p_mutex
Definition helper.h:96
void * p_buffer
Definition helper.h:89
int yBlockSize
Definition helper.h:95
size_t size
Definition helper.h:91
GDALDataType type
Definition helper.h:90
double nan
Definition helper.h:93
GDALRasterBand * p_band
Definition helper.h:88