Source code for raptorstats.agqt

import os
import warnings

import geopandas as gpd
import numpy as np
import rasterio as rio
from rtree import index
from shapely import box

from raptorstats.node import Node
from raptorstats.scanline import (
    Scanline,
    build_intersection_table,
    build_reading_table,
    process_reading_table,
)
from raptorstats.zonal_stat_method import ZonalStatMethod


[docs] class AggQuadTree(ZonalStatMethod): __name__ = "AggQuadTree" def __init__( self, max_depth: int = 5, index_path: str = None, build_index: bool = False ): self.max_depth = max_depth if index_path is None: index_path = f"index_{self.__name__}_depth_{max_depth}" else: # strip .{shit} if exists index_path = index_path.split(".")[0] self.index_path = index_path self.build_index = build_index super().__init__() def _index_files_exist(self): idx_file = f"{self.index_path}.idx" dat_file = f"{self.index_path}.dat" return os.path.exists(idx_file) and os.path.exists(dat_file) def _should_build_index(self): return self.build_index or not self._index_files_exist() def _get_params(self): return {"max_depth": self.max_depth, "index_path": self.index_path} def _compute_scanline_reading_table( self, features: gpd.GeoDataFrame, raster: rio.DatasetReader ): f_index, intersection_coords = build_intersection_table(features, raster) if intersection_coords.size == 0: reading_table = np.empty((0, 7), dtype=int) f_index_starts = np.array([], dtype=int) unique_f_indices = np.array([], dtype=int) self._reading_table = reading_table self.f_index_starts = f_index_starts self.unique_f_indices = unique_f_indices self.n_features = len(features.geometry) self.effective_n_features = 0 return reading_table = build_reading_table( f_index, intersection_coords, raster, return_coordinates=False, sort_by_feature=True, ) self._reading_table = reading_table # row, col0, col1, f_index unique_f_indices, f_index_starts = np.unique( reading_table[:, 3], return_index=True ) self.f_index_starts = f_index_starts self.unique_f_indices = unique_f_indices self.n_features = len(features.geometry) self.effective_n_features = len(f_index_starts) def _compute_quad_tree(self, raster: rio.DatasetReader): def part1by1(n): n &= 0x0000FFFF n = (n | (n << 8)) & 0x00FF00FF n = (n | (n << 4)) & 0x0F0F0F0F n = (n | (n << 2)) & 0x33333333 n = (n | (n << 1)) & 0x55555555 return n def z_order_indices(xs, ys): return (part1by1(ys) << 1) | part1by1(xs) def get_boxes_aligned_even(divisions: int, raster: rio.DatasetReader): """Build a divisions x divisions grid of pixel-aligned tiles with remainder distributed as evenly as possible. Returns (boxes, ix, iy) with boxes as (left, bottom, right, top) in CRS coords. """ from rasterio.windows import Window from rasterio.windows import bounds as win_bounds if divisions <= 0: raise ValueError("divisions must be > 0") W, H = raster.width, raster.height # Integer edges: floor(i*W/D), floor(j*H/D) col_edges = (np.arange(divisions + 1) * W) // divisions row_edges = (np.arange(divisions + 1) * H) // divisions col_offs = col_edges[:-1].astype(int) row_offs = row_edges[:-1].astype(int) widths = np.diff(col_edges).astype(int) heights = np.diff(row_edges).astype(int) boxes, ix, iy = [], [], [] for iy_ in range(divisions): h = heights[iy_] r0 = row_offs[iy_] for ix_ in range(divisions): w = widths[ix_] c0 = col_offs[ix_] win = Window(c0, r0, w, h) boxes.append(win_bounds(win, raster.transform)) ix.append(ix_) iy.append(iy_) return np.asarray(boxes, float), np.asarray(ix, int), np.asarray(iy, int) def coarsen_from_children(boxes, ix, iy, divisions): """Given current level (divisions×divisions) tiles, build parent level ((div/2)x(div/2)) by grouping children with parent indices (ix//2, iy//2) and unioning bounds.""" parent_div = divisions // 2 pix = ix // 2 piy = iy // 2 # Composite key to group by parent cell keys = piy * parent_div + pix order = np.argsort(keys) boxes = boxes[order] pix = pix[order] piy = piy[order] keys = keys[order] uniq, starts = np.unique(keys, return_index=True) out_boxes = np.empty((len(uniq), 4), dtype=boxes.dtype) out_ix = np.empty(len(uniq), dtype=int) out_iy = np.empty(len(uniq), dtype=int) for k in range(len(uniq)): s = starts[k] e = starts[k + 1] if k + 1 < len(uniq) else len(keys) g = boxes[s:e] # union bounds out_boxes[k, 0] = np.min(g[:, 0]) # left out_boxes[k, 1] = np.min(g[:, 1]) # bottom out_boxes[k, 2] = np.max(g[:, 2]) # right out_boxes[k, 3] = np.max(g[:, 3]) # top out_ix[k] = pix[s] out_iy[k] = piy[s] return out_boxes, out_ix, out_iy n_pixels_width = raster.width n_pixels_height = raster.height min_size = min(n_pixels_width, n_pixels_height) if min_size / (2**self.max_depth) < 1: raise ValueError( f"Max depth {self.max_depth} is too high for raster size {n_pixels_width}x{n_pixels_height}. " "Reduce max_depth." ) box_index = 0 idx = index.Index(self.index_path) all_nodes = [] # Build leaf level level = self.max_depth divisions = 2**level boxes, ix, iy = get_boxes_aligned_even(divisions, raster) previous_aggregates = None previous_parent_ids = None # Iterate from leaves up to root, coarsening for level in range(self.max_depth, -1, -1): divisions = 2**level z_indices = z_order_indices(ix, iy) sort_idx = np.argsort(z_indices) boxes_lvl = boxes[sort_idx] ix_lvl = ix[sort_idx] iy_lvl = iy[sort_idx] z_lvl = z_indices[sort_idx] ids = z_lvl + box_index shp_boxes = [box(x0, y0, x1, y1) for (x0, y0, x1, y1) in boxes_lvl] n_boxes = len(boxes_lvl) box_index += n_boxes # advance global id space to parent level if level == 0: parent_ids = np.array([-1]) else: parent_ids = ( z_lvl // 4 ) + box_index # parent ids live in the next chunk if level == self.max_depth: # Compute aggregates only once (on leaves) vector_layer = gpd.GeoDataFrame(geometry=shp_boxes, crs=raster.crs) sc = Scanline() aggregates = np.array(sc(raster, vector_layer, self.stats)) else: # Combine child aggregates into parents using parent_ids computed at previous (finer) level aggregates = [] for p_id in np.unique(previous_parent_ids): mask = previous_parent_ids == p_id child_aggregates = previous_aggregates[mask] aggregates.append(self.stats.from_partials(child_aggregates)) aggregates = np.array(aggregates) previous_aggregates = aggregates previous_parent_ids = parent_ids # Emit nodes for this level for i in range(n_boxes): node = Node( # Node computes pixel_bounds from box and transform _id=ids[i], parent_id=parent_ids[i] if level > 0 else -1, level=level, max_depth=self.max_depth, _box=shp_boxes[i], transform=raster.transform, stats=aggregates[i], ) idx.insert(ids[i], boxes_lvl[i], obj=node) all_nodes.append(node) # Prepare parent level boxes by unioning children bounds if level > 0: boxes, ix, iy = coarsen_from_children( boxes_lvl, ix_lvl, iy_lvl, divisions ) self.idx = idx def _load_quad_tree(self): try: self.idx = index.Index(self.index_path) except Exception as e: print(f"Error loading index: {e}") raise e def _delete_index_files(self): idx_file = f"{self.index_path}.idx" dat_file = f"{self.index_path}.dat" if os.path.exists(idx_file): os.remove(idx_file) if os.path.exists(dat_file): os.remove(dat_file) def _precomputations(self, features: gpd.GeoDataFrame, raster: rio.DatasetReader): if self._should_build_index(): if self._index_files_exist(): warnings.warn( f"Index files {self.index_path}.idx and {self.index_path}.dat already exist. They will be overwritten.", ) self._delete_index_files() self._compute_quad_tree(raster) else: self._load_quad_tree() self._compute_scanline_reading_table(features, raster) def _run(self, features: gpd.GeoDataFrame, raster: rio.DatasetReader): self._precomputations(features, raster) reading_table = self._reading_table f_index_starts = self.f_index_starts n_features = self.n_features n_eff_features = self.effective_n_features if reading_table.size == 0: # No intersections found, return empty stats return [self.stats.empty() for _ in range(n_features)] new_reading_table = [] partials = [[] for _ in range(n_features)] # Debugging # H, W = raster.height, raster.width # window = rio.windows.Window(0, 0, W, H).round_offsets().round_lengths() # for plotting/ref # global_mask = np.zeros((H, W), dtype=int) # global mask for debugging # used_nodes = [[] for _ in range(n_features)] # End Debugging for f_index, geom in enumerate(features.geometry): if f_index not in self.unique_f_indices: # Skip features that do not have any intersections # self.unique_f_indices was computed from the scanline reading table # so if this f_index does not appear, it means there are n intersections continue # Get all intersecting nodes, sorted from root to leaves # window for geom window_bounds = geom.bounds nodes = list(self.idx.intersection(window_bounds, objects=True))[::-1] nodes = [node.object for node in nodes] # TODO: one liner nodes = [node for node in nodes if node.is_contained_in_geom(geom)] nodes = sorted(nodes, key=lambda x: x.level) without_children = [] nids = set([int(n.id) for n in nodes]) for n in nodes: if int(n.parent_id) not in nids: without_children.append(n) nodes = without_children f_index_start = f_index_starts[f_index] f_index_end = ( f_index_starts[f_index + 1] if f_index + 1 < n_eff_features else len(reading_table) ) f_reading_table = reading_table[f_index_start:f_index_end] if len(nodes) == 0: new_reading_table.append(f_reading_table) continue # Debugging # put node 1 in position 0 # n1 = nodes[0] # n0 = nodes[0] # nodes[0] = nodes[1] # nodes[1] = n0 # End Debugging for node in nodes: n_r0, n_r1, n_c0, n_c1 = node.pixel_bounds # Debugging # Mark in global mask # sq_win = rio.windows.from_bounds( # n_x0, n_y0, n_x1, n_y1, transform=transform # ) # sq_win = sq_win.round_offsets().round_lengths() # inter = rio.windows.intersection(sq_win, window) # # mark in global mask # loc_row0 = int(round(inter.row_off - subset_row0)) # loc_col0 = int(round(inter.col_off - subset_col0)) # h = int(round(inter.height)) # w = int(round(inter.width)) # r1 = max(0, loc_row0); c1 = max(0, loc_col0) # r2 = min(global_mask.shape[0], r1 + h) # c2 = min(global_mask.shape[1], c1 + w) # if r1 < r2 and c1 < c2: # global_mask[r1:r2, c1:c2] = f_index + 1 # used_nodes[f_index].append(node) # ensure stats are correct # import rasterstats # assert rasterstats.zonal_stats(node.box, raster.name, stats="count")[0]['count'] == node.stats['count'] # End Debugging # Add the partials of this node partials[f_index].append(node.stats) rows = f_reading_table[:, 0] col0s = f_reading_table[:, 1] col1s = f_reading_table[:, 2] fully_inside = ( (rows >= n_r0) & (rows < n_r1) & (col0s >= n_c0) & (col1s <= n_c1) ) # Remove fully inside strips, as they are already accounted for # by the node f_reading_table = f_reading_table[~fully_inside] rows = f_reading_table[:, 0] col0s = f_reading_table[:, 1] col1s = f_reading_table[:, 2] f_idxs = f_reading_table[:, 3] new_entries = [] # Left cut: segments or strips crossing the node's left edge # includes both the strips only crossing the left edge # and segments crossing both the left and right edges Lmask = (rows >= n_r0) & (rows < n_r1) & (col0s < n_c0) & (col1s > n_c0) if np.any(Lmask): l_rows = rows[Lmask] l_c0 = col0s[Lmask] l_c1 = np.minimum(col1s[Lmask], n_c0) # cut at left edge keep = l_c1 > l_c0 # drop zero-length # "Cut" the left edge, creating a new shorter segment in the left part # that ends at the node's left edge if np.any(keep): left_reading = np.stack( [l_rows[keep], l_c0[keep], l_c1[keep], f_idxs[Lmask][keep]], axis=1, ).astype(int) new_entries.append(left_reading) # Right cut: segments crossing the node's right edge # includes both the strips only crossing the right edge # and segments crossing both the left and right edges Rmask = (rows >= n_r0) & (rows < n_r1) & (col1s > n_c1) & (col0s < n_c1) if np.any(Rmask): r_rows = rows[Rmask] r_c0 = np.maximum(col0s[Rmask], n_c1) # start at node's right edge r_c1 = col1s[Rmask] r_f = f_idxs[Rmask] # "Cut" the right edge, creating a new shorter segment in the right part # that starts at the node's right edge keep = r_c1 > r_c0 # drop zero-length pieces if np.any(keep): right_reading = np.stack( [r_rows[keep], r_c0[keep], r_c1[keep], r_f[keep]], axis=1 ).astype(int) new_entries.append(right_reading) # Keep everything that is outside the node as is # outside means up and down the node, or completely # starting and ending to the left or right of the node inside_rows = (rows >= n_r0) & (rows < n_r1) outside_mask = (~inside_rows) | (col1s <= n_c0) | (col0s >= n_c1) outside_rows = f_reading_table[outside_mask] # merge: outside + any left/right pieces parts = [outside_rows] for part in new_entries: if part is not None and part.size: # ensure int dtype and 2D shape parts.append(part.astype(int, copy=False).reshape(-1, 4)) f_reading_table = np.vstack(parts) if len(parts) > 1 else outside_rows # Debugging # step_reading_table = f_reading_table.copy() # # step_coord_table = f_coord_table.copy() # # add column indicating if it was outside_rows or new_entries # # was_outside = np.zeros(len(step_reading_table), dtype=bool) # # was_outside[:len(outside_rows)] = True # # sort new reading table by rows # # sort_idx = np.lexsort((new_reading_table[:, 3], new_reading_table[:, 0])) # step_sort_idx = np.argsort(step_reading_table[:, 0]) # step_reading_table = step_reading_table[step_sort_idx] # "Help me, Step Reading Table, I'm stuck...." # # step_coord_table = step_coord_table[step_sort_idx] # # was_outside = was_outside[step_sort_idx] # rows, row_starts = np.unique(step_reading_table[:, 0], return_index=True) # # yss = step_coord_table[:, 0] # # yss = yss[row_starts] # yss = (raster.transform * (np.zeros_like(rows), rows + 0.5))[1] # row_start, row_end = int(np.floor(window.row_off)), int( # np.ceil(window.row_off + window.height) # ) # col_start, col_end = int(np.floor(window.col_off)), int( # np.ceil(window.col_off + window.width) # ) # from raptorstats.debugutils import ref_mask_rasterstats, compare_stats, plot_mask_comparison # for i, row in enumerate(rows): # start = row_starts[i] # end = row_starts[i + 1] if i + 1 < len(row_starts) else len(step_reading_table) # reading_line = step_reading_table[start:end] # min_col = np.min(reading_line[:, 1]) # max_col = np.max(reading_line[:, 2]) # reading_window = rio.windows.Window( # col_off=min_col, row_off=row, width=max_col - min_col, height=1 # ) # curr_y = yss[i] # # if np.allclose(curr_y, 3851869.0157146556): # print(f"Processing row {row} (y={curr_y})") # # Does not handle nodata # data = raster.read(1, window=reading_window, masked=True) # if data.shape[0] == 0: # continue # data = data[0] # for j, col0, col1, f_index in reading_line: # row_in_mask = row - row_start # col0_in_mask = col0 - col_start # col1_in_mask = col1 - col_start # global_mask[row_in_mask, col0_in_mask:col1_in_mask] = f_index + 1 # # if row == 512 or row == 479: # # global_mask[row_in_mask, col0_in_mask:col1_in_mask] = -1 # # if not was_outside[i]: # # global_mask[row_in_mask, col0_in_mask:col1_in_mask] = -1 # # else: # # global_mask[row_in_mask, col0_in_mask:col1_in_mask] = -1 # # global_mask = global_mask[:-1, :] # remove the last row added for debugging # ref_mask = ref_mask_rasterstats(features, raster, window) # print(f"Node Level: {node.level}") # # plot_mask_comparison(global_mask, ref_mask, features, raster.transform, window=window, scanlines=yss, idx=self.idx, used_nodes=used_nodes[f_index]) # global_mask = np.zeros((H, W), dtype=int) # reset global mask for next node # print('done') # End Debugging new_reading_table.append(f_reading_table) # Sort the lines by row so we can process them using scanline new_reading_table = np.vstack(new_reading_table) sort_idx = np.argsort(new_reading_table[:, 0]) reading_table = new_reading_table[sort_idx] self.results = process_reading_table( reading_table, features, raster, self.stats, partials=partials ) # Debugging # rows, row_starts = np.unique(reading_table[:, 0], return_index=True) # row_start, row_end = int(np.floor(window.row_off)), int( # np.ceil(window.row_off + window.height) # ) # col_start, col_end = int(np.floor(window.col_off)), int( # np.ceil(window.col_off + window.width) # ) # from raptorstats.debugutils import ref_mask_rasterstats, compare_stats, plot_mask_comparison # diffs = compare_stats(self.results, self.raster.files[0], features, stats=self.stats, show_diff=True, precision=5) # # diffs = [] # if len(diffs) == 0: # print("No differences found!!!") # return self.results # diff_indices = [d['feature'] for d in diffs] # diff_features = features.iloc[diff_indices] # f_index_to_diff_features = { f: i for i, f in enumerate(diff_indices) } # for i, row in enumerate(rows): # start = row_starts[i] # end = row_starts[i + 1] if i + 1 < len(row_starts) else len(reading_table) # reading_line = reading_table[start:end] # min_col = np.min(reading_line[:, 1]) # max_col = np.max(reading_line[:, 2]) # reading_window = rio.windows.Window( # col_off=min_col, row_off=row, width=max_col - min_col, height=1 # ) # # Does not handle nodata # data = raster.read(1, window=reading_window, masked=True) # if data.shape[0] == 0: # continue # data = data[0] # for j, col0, col1, f_index in reading_line: # row_in_mask = row - row_start # col0_in_mask = col0 - col_start # col1_in_mask = col1 - col_start # if len(diff_indices) > 0: # if f_index in diff_indices: # global_mask[row_in_mask, col0_in_mask:col1_in_mask] = f_index_to_diff_features[f_index] + 1 # else: # global_mask[row_in_mask, col0_in_mask:col1_in_mask] = f_index + 1 # global_mask = global_mask[:-1, :] # remove the last row added for debugging # ref_mask = ref_mask_rasterstats(diff_features if len(diff_indices) > 0 else features, raster, window) # diff_used_nodes = [] # for di in diff_indices: # diff_used_nodes.append(used_nodes[di]) # diff_used_nodes = [n for nlist in diff_used_nodes for n in nlist] # used_nodes= [ n for nlist in used_nodes for n in nlist ] # # used_nodes = [] # print(diffs) # plot_mask_comparison(global_mask, ref_mask, diff_features if len(diff_indices) > 0 else features, raster.transform, window=window, scanlines=inter_ys, idx=self.idx, used_nodes=diff_used_nodes if len(diff_indices) >0 else used_nodes) # print('done') # End Debugging code return self.results