# Binary data with lazy loading

Author: Aureliana Barghini (B-Open)
If you want your backend to be effective with big datasets, you should support
lazy loading. To do that you need to:

- Implement `_raw_indexing_method` for reading blocks from disk.
- Implement some glue code to make it work with Xarray:
  - put your `_raw_indexing_method` in a `BackendArray` subclass
  - replace the `numpy.ndarray` inside your dataset with your subclass of `BackendArray`
## Create sample files
```python
import os

import dask
import numpy as np
import xarray as xr

# Write two flat binary files: one of int64 values, one of float64 values.
# Binary mode ("wb") avoids newline translation on Windows.
arr = np.arange(30000000, dtype=np.int64)
with open("foo.bin", "wb") as f:
    arr.tofile(f)

arr = np.arange(30000000, dtype=np.float64)
with open("foo_float.bin", "wb") as f:
    arr.tofile(f)
```
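The backend below leans on `numpy.fromfile`, which accepts a byte `offset` and an
element `count`, so a block can be read without touching the rest of the file. As a
quick sanity check on the file just written (a standalone sketch, not part of the
backend itself):

```python
# Read 5 int64 values starting at element 10.
# offset is measured in bytes, count in elements of the requested dtype.
itemsize = np.dtype(np.int64).itemsize
with open("foo.bin", "rb") as f:
    block = np.fromfile(f, np.int64, offset=10 * itemsize, count=5)
print(block)  # [10 11 12 13 14]
```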
## BinaryBackendArray
The `BackendArray` subclass shall implement the following method and attributes:

- the `_raw_indexing_method` method, supporting item selection and slicing
- a `__getitem__` method that wraps `_raw_indexing_method` with the Xarray helper
  function `explicit_indexing_adapter` (threadsafe)
- the `shape` attribute
- the `dtype` attribute
```python
class BinaryBackendArray(xr.backends.BackendArray):
    def __init__(
        self,
        filename_or_obj,
        shape,
        dtype,
        lock,
    ):
        self.filename_or_obj = filename_or_obj
        self.shape = shape
        self.dtype = dtype
        self.lock = lock

    def __getitem__(self, key: tuple):
        return xr.core.indexing.explicit_indexing_adapter(
            key,
            self.shape,
            xr.core.indexing.IndexingSupport.BASIC,
            self._raw_indexing_method,
        )

    def _raw_indexing_method(self, key: tuple):
        key0 = key[0]
        size = np.dtype(self.dtype).itemsize

        if isinstance(key0, slice):
            start = key0.start or 0
            stop = key0.stop or self.shape[0]
            offset = size * start
            count = stop - start
        else:
            offset = size * key0
            count = 1

        # Open in binary mode and read only the requested block.
        with self.lock, open(self.filename_or_obj, "rb") as f:
            arr = np.fromfile(f, self.dtype, offset=offset, count=count)

        # For integer indexing, drop the dimension.
        if isinstance(key0, int):
            arr = arr.squeeze()

        return arr
```
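You can exercise the adapter directly by indexing a `BinaryBackendArray` by hand.
A minimal sketch: `BasicIndexer` lives in Xarray's internal `xr.core.indexing`
module (the same module the class above already uses) and is normally constructed
by Xarray itself:

```python
backend_array = BinaryBackendArray(
    filename_or_obj="foo.bin",
    shape=(30000000,),
    dtype=np.int64,
    lock=dask.utils.SerializableLock(),
)

# Xarray would normally build this indexer; here we create one by hand.
key = xr.core.indexing.BasicIndexer((slice(100, 105),))
print(backend_array[key])  # [100 101 102 103 104] -- only 5 items read
```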
## BinaryBackend Entrypoint
```python
class BinaryBackend(xr.backends.BackendEntrypoint):
    def open_dataset(self, filename_or_obj, *, drop_variables=None, dtype=np.int64):
        # Infer the number of elements from the file size.
        size = np.dtype(dtype).itemsize
        shape = os.stat(filename_or_obj).st_size // size

        backend_array = BinaryBackendArray(
            filename_or_obj=filename_or_obj,
            shape=(shape,),
            dtype=dtype,
            lock=dask.utils.SerializableLock(),
        )
        # Wrap the backend array so all indexing is deferred.
        data = xr.core.indexing.LazilyIndexedArray(backend_array)

        var = xr.Variable(dims=("x",), data=data)
        return xr.Dataset({"foo": var})
```
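Opening through this entrypoint is lazy even without dask: `LazilyIndexedArray`
defers all reads, and selecting a slice pulls only that block through
`_raw_indexing_method`. A minimal sketch:

```python
ds = xr.open_dataset("foo.bin", engine=BinaryBackend)

# Nothing has been read yet; this selection loads only ten elements.
print(ds["foo"][:10].values)  # [0 1 2 3 4 5 6 7 8 9]
```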
## Reduced memory usage with dask
Opening with `chunks` wraps the lazy array in dask, so reductions and selections
run chunk by chunk instead of loading the whole file:

```python
arr = xr.open_dataarray("foo.bin", engine=BinaryBackend, chunks=10000)
arr
```
```
<xarray.DataArray 'foo' (x: 30000000)> Size: 240MB
dask.array<open_dataset-foo, shape=(30000000,), dtype=int64, chunksize=(10000,), chunktype=numpy.ndarray>
Dimensions without coordinates: x
```

```python
arr.mean()
```
```
<xarray.DataArray 'foo' ()> Size: 8B
dask.array<mean_agg-aggregate, shape=(), dtype=float64, chunksize=(), chunktype=numpy.ndarray>
```

```python
arr.sel(x=slice(0, 10))
```
```
<xarray.DataArray 'foo' (x: 10)> Size: 80B
dask.array<getitem, shape=(10,), dtype=int64, chunksize=(10,), chunktype=numpy.ndarray>
Dimensions without coordinates: x
```

```python
arr.sel(x=slice(0, 10)).compute()
```
```
<xarray.DataArray 'foo' (x: 10)> Size: 80B
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
Dimensions without coordinates: x
```

```python
arr.load()
```
```
<xarray.DataArray 'foo' (x: 30000000)> Size: 240MB
array([       0,        1,        2, ..., 29999997, 29999998, 29999999])
Dimensions without coordinates: x
```
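Since `_raw_indexing_method` reads with `self.dtype`, the same entrypoint also
handles the float file created at the start; `open_dataarray` forwards unknown
keyword arguments such as `dtype` on to our `open_dataset`. A minimal sketch:

```python
arr_float = xr.open_dataarray(
    "foo_float.bin", engine=BinaryBackend, dtype=np.float64, chunks=10000
)
print(arr_float.dtype)      # float64
print(arr_float[3].values)  # 3.0
```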