Source code for sisl.utils._arrays

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.
from __future__ import annotations

"""Special handlers for arrays """
import numpy as np

__all__ = ["batched_indices"]


[docs] def batched_indices(ref, y, axis=-1, atol=1e-8, batch_size=200, diff_func=None): """Locate `x` in `ref` by examining ``np.abs(diff_func(ref - y)) <= atol`` This method is necessary for very large groups of data since the ``ref-y`` calls will use broad-casting to create very large memory chunks. This method will allow broad-casting arrays up to a size of `batch_size`. The memory is calculating using ``np.prod(ref.shape) / ref.shape[axis] * n`` such that ``n`` chunks of `y` is using the `batch_size` MB of memory. ``n`` will minimally be 1, regardless of `batch_size`. Parameters ---------- ref : array_like reference array where we wish to locate the indices of `y` in y : array_like of 1D or 2D array to locate in `ref`. For 2D arrays and `axis` not None, axis : int or None, optional which axis to do a logical reduction along, if `None` it means that they are 1D arrays and no axis will be reduced, i.e. same as ``ref.ravel() - y.reshape(-1, 1)`` but retaining indices of the same dimension as `ref`. atol : float or array_like, optional absolute tolerance for comparing values, for array_like values it must have the same length as ``ref.shape[axis]`` batch_size : float, optional maximum size of broad-casted array. Internal algorithms will determine the chunk size of `y` diff_func : callable, optional function to post-process the difference values, defaults to do nothing. Should have an interface ``def diff_func(diff)`` Returns ------- tuple of ndarray: the indices for each `ref` dimension that matches the values in `y`. the returned indices behave like `numpy.nonzero`. """ # first ensure the arrays are numpy arrays ref = np.asarray(ref) y = np.asarray(y) # atol may be an array of the same dimension as the axis atol = np.asarray(atol) # determine best chunk-size of `y` n = batch_size / (np.prod(ref.shape) * ref.itemsize / 1024**2) n = max(1, n) if diff_func is None: def diff_func(d): """Do nothing""" return d def yield_cs(n, size): n = max(1, int(n)) i = 0 for j in range(n, size, n): yield range(i, j) i += n if i < size: yield range(i, size) indices = [] # determine the batch size if axis is None: if atol.size != 1: raise ValueError( f"batched_indices: for 1D comparisons atol can only be a single number." ) if y.ndim != 1: raise ValueError( f"batched_indices: axis is None and y.ndim != 1 ({y.ndim}). For ravel comparisons the " "dimensionality of y must be 1D." ) # a 1D array comparison # we might as well ravel y (here to ensure we do not # overwrite the input y's shape n = min(n, y.size) # create shapes y = np.expand_dims(y, tuple(range(ref.ndim))) ref = np.expand_dims(ref, ref.ndim) # b-cast size is for idx in yield_cs(n, y.size): idx = (np.abs(diff_func(ref - y[..., idx])) <= atol).nonzero()[:-1] indices.append(idx) # concatenate each indices array return tuple(map(np.concatenate, zip(*indices))) # Axis is specified # y must have 2 dimensions (or 1 with the same size as ref.shape[axis]) if y.ndim == 1: if y.size != ref.shape[axis]: raise ValueError( f"batched_indices: when y is a single value it must have same length as ref.shape[axis]" ) y = y.reshape(1, -1) elif y.ndim == 2: if y.shape[1] != ref.shape[axis]: raise ValueError( f"batched_indices: the comparison axis of y (y[0, :]) should have the same length as ref.shape[axis]" ) else: raise ValueError(f"batched_indices: y should be either 1D or 2D") # create shapes, since we change the shapes we must fix the axis' if axis < 0: axis = axis + ref.ndim y = np.expand_dims(y, tuple(range(ref.ndim - 1))) y = np.moveaxis(y, -1, axis) ref = np.expand_dims(ref, ref.ndim) if atol.size > 1: if atol.size != ref.shape[axis]: raise ValueError( f"batched_indices: atol size does not match the axis {axis} for ref argument." ) atol = np.expand_dims(atol.ravel(), tuple(range(ref.ndim - 1))) atol = np.moveaxis(atol, -1, axis) # b-cast size is for idx in yield_cs(n, y.shape[-1]): idx = np.logical_and.reduce( np.abs(diff_func(ref - y[..., idx])) <= atol, axis=axis ).nonzero()[:-1] indices.append(idx) # concatenate each indices array return tuple(map(np.concatenate, zip(*indices)))