"""Source code for tiled.structures.array."""

import enum
import os
import sys
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

import numpy


class Endianness(str, enum.Enum):
    """Byte order of a binary value.

    One of ``big``, ``little``, or ``not_applicable`` (used for types
    where byte order has no meaning, e.g. single-byte values).
    """

    big = "big"
    little = "little"
    not_applicable = "not_applicable"
class ObjectArrayTypeDisabled(ValueError):
    """Raised when a numpy 'object' dtype is requested but support is disabled.

    See ``Kind`` and the ``TILED_ALLOW_OBJECT_ARRAYS`` environment variable.
    """
class Kind(str, enum.Enum):
    """Single-character numpy type-kind codes.

    See https://numpy.org/devdocs/reference/arrays.interface.html#object.__array_interface__

    The term "kind" comes from the numpy API as well.

    Note: At import time, the environment variable ``TILED_ALLOW_OBJECT_ARRAYS``
    is checked. If it is set to anything other than ``"0"``, then this
    Enum gets an additional member::

        object = "O"

    to support numpy 'object'-type arrays which hold generic Python objects.
    Numpy 'object'-type arrays are not enabled by default because their binary
    representation is not interpretable by clients other than Python.  It is
    recommended to convert your data to a non-object type if possible so that
    it can be read by non-Python clients.
    """

    bit_field = "t"
    boolean = "b"
    integer = "i"
    unsigned_integer = "u"
    floating_point = "f"
    complex_floating_point = "c"
    timedelta = "m"
    datetime = "M"
    string = "S"  # fixed-length sequence of char
    unicode = "U"  # fixed-length sequence of Py_UNICODE
    other = "V"  # "V" is for "void" -- generic fixed-size chunk of memory

    # Numpy 'object' arrays are opt-in only; see the class docstring.
    if os.getenv("TILED_ALLOW_OBJECT_ARRAYS", "0") != "0":
        object = "O"  # Object (i.e. the memory contains a pointer to PyObject)

    @classmethod
    def _missing_(cls, value):
        # Give a helpful error for "O" when object arrays are disabled;
        # any other unknown value falls through to the standard ValueError.
        if value == "O":
            raise ObjectArrayTypeDisabled(
                "Numpy 'object'-type arrays are not enabled by default "
                "because their binary representation is not interpretable "
                "by clients other than Python. "
                "It is recommended to convert your data to a non-object type "
                "if possible so that it can be read by non-Python clients. "
                "If this is not possible, you may enable 'object'-type arrays "
                "by setting the environment variable TILED_ALLOW_OBJECT_ARRAYS=1 "
                "in the server."
            )
@dataclass
class BuiltinDtype:
    """Description of a scalar (non-structured) numpy dtype."""

    endianness: Endianness
    kind: Kind
    itemsize: int
    dt_units: Optional[str] = None  # e.g. "[ns]" for datetime/timedelta kinds

    # Map numpy byteorder characters to Endianness values and back.
    # "=" means native order, resolved here via sys.byteorder.
    __endianness_map = {
        ">": "big",
        "<": "little",
        "=": sys.byteorder,
        "|": "not_applicable",
    }
    __endianness_reverse_map = {"big": ">", "little": "<", "not_applicable": "|"}

    @classmethod
    def from_numpy_dtype(cls, dtype) -> "BuiltinDtype":
        """Build a BuiltinDtype from a ``numpy.dtype``."""
        # For datetime/timedelta kinds, recover the time units embedded in the
        # dtype string, e.g. '<M8[ns]' has dt_units = '[ns]'. The count gives
        # the number of base units per step, e.g. '[10ns]' when count > 1.
        if dtype.kind in ("m", "M"):
            unit, count = numpy.datetime_data(dtype)
            units = f"[{count if count > 1 else ''}{unit}]"
        else:
            units = None
        return cls(
            endianness=cls.__endianness_map[dtype.byteorder],
            kind=Kind(dtype.kind),
            itemsize=dtype.itemsize,
            dt_units=units,
        )

    def to_numpy_dtype(self) -> numpy.dtype:
        """Reconstruct the equivalent ``numpy.dtype``."""
        return numpy.dtype(self.to_numpy_str())

    def to_numpy_str(self):
        """Render this description as a numpy dtype string, e.g. '<f8'."""
        byteorder_char = self.__endianness_reverse_map[self.endianness]
        # dtype.itemsize always reports bytes. For numeric types the format
        # string is {type_code}{byte_count}, so itemsize is used directly.
        #
        # Unicode is the exception: the pattern is 'U{char_count}', and numpy
        # always stores fixed-width 4-byte UCS4 (variable-width encodings do
        # not fit with fixed-stride arrays), so the reported itemsize is 4x
        # the character count. Divide by 4 to get back to the string form.
        if self.kind == Kind.unicode:
            size = self.itemsize // 4
        else:
            size = self.itemsize
        return f"{byteorder_char}{self.kind.value}{size}{self.dt_units or ''}"

    @classmethod
    def from_json(cls, structure):
        """Build a BuiltinDtype from its JSON (dict) representation."""
        return cls(
            kind=Kind(structure["kind"]),
            itemsize=structure["itemsize"],
            endianness=Endianness(structure["endianness"]),
            # dt_units is optional in older documents; default to None.
            dt_units=structure.get("dt_units"),
        )
@dataclass
class Field:
    """One named member of a structured (record) dtype."""

    name: str
    dtype: Union[BuiltinDtype, "StructDtype"]
    shape: Optional[Tuple[int, ...]]

    @classmethod
    def from_numpy_descr(cls, field):
        """Build a Field from one entry of a numpy dtype's ``descr`` list."""
        name, *rest = field
        if name == "":
            # An empty name means this is descr of a base dtype or a
            # subdtype, not a structured field.
            raise ValueError(
                f"You seem to have gotten descr of a base or subdtype: {field}"
            )
        # An entry is either (name, type) or (name, type, shape).
        if len(rest) == 1:
            (f_type,) = rest
            shape = None
        else:
            f_type, shape = rest
        # A string type code denotes a builtin dtype; anything else is a
        # nested structured dtype.
        if isinstance(f_type, str):
            parsed_dtype = BuiltinDtype.from_numpy_dtype(numpy.dtype(f_type))
        else:
            parsed_dtype = StructDtype.from_numpy_dtype(numpy.dtype(f_type))
        return cls(name=name, dtype=parsed_dtype, shape=shape)

    def to_numpy_descr(self):
        """Render this field as one numpy ``descr`` entry (a tuple)."""
        if isinstance(self.dtype, BuiltinDtype):
            type_repr = self.dtype.to_numpy_str()
        else:
            type_repr = self.dtype.to_numpy_descr()
        if self.shape is None:
            return (self.name, type_repr)
        return (self.name, type_repr, self.shape)

    @classmethod
    def from_json(cls, structure):
        """Build a Field from its JSON (dict) representation."""
        dtype_doc = structure["dtype"]
        # A "fields" key marks a nested structured dtype document.
        parser = StructDtype if "fields" in dtype_doc else BuiltinDtype
        return cls(
            name=structure["name"],
            dtype=parser.from_json(dtype_doc),
            shape=structure["shape"],
        )


@dataclass
class StructDtype:
    """Description of a structured (record) numpy dtype."""

    itemsize: int
    fields: List[Field]

    @classmethod
    def from_numpy_dtype(cls, dtype):
        """Build a StructDtype from a structured ``numpy.dtype``."""
        # Subdtypes push extra dimensions into arrays; those should be
        # handled a layer up, reporting an array with bigger dimensions.
        if dtype.subdtype is not None:
            raise ValueError(f"We do not know how to encode subdtypes: {dtype}")
        # A builtin (base) type must be described with BuiltinDtype instead.
        if dtype.fields is None:
            raise ValueError(f"You have a base type: {dtype}")
        return cls(
            itemsize=dtype.itemsize,
            fields=[Field.from_numpy_descr(entry) for entry in dtype.descr],
        )

    def to_numpy_dtype(self):
        """Reconstruct the equivalent ``numpy.dtype``."""
        return numpy.dtype(self.to_numpy_descr())

    def to_numpy_descr(self):
        """Render this struct as a numpy ``descr`` list of field entries."""
        return [member.to_numpy_descr() for member in self.fields]

    def max_depth(self):
        """Return nesting depth: 1 for a flat struct, +1 per nested struct."""
        depths = []
        for member in self.fields:
            if isinstance(member.dtype, BuiltinDtype):
                depths.append(1)
            else:
                depths.append(1 + member.dtype.max_depth())
        return max(depths)

    @classmethod
    def from_json(cls, structure):
        """Build a StructDtype from its JSON (dict) representation."""
        return cls(
            itemsize=structure["itemsize"],
            fields=[Field.from_json(entry) for entry in structure["fields"]],
        )
@dataclass
class ArrayStructure:
    """Description of an array: dtype, shape, chunking, dimension names."""

    data_type: Union[BuiltinDtype, StructDtype]
    chunks: Tuple[Tuple[int, ...], ...]  # tuple-of-tuples-of-ints like ((3,), (3,))
    shape: Tuple[int, ...]  # tuple of ints like (3, 3)
    dims: Optional[Tuple[str, ...]] = None  # None or tuple of names like ("x", "y")
    resizable: Union[bool, Tuple[bool, ...]] = False

    @classmethod
    def from_json(cls, structure):
        """Build an ArrayStructure from its JSON (dict) representation."""
        dtype_doc = structure["data_type"]
        # A "fields" key marks a structured (record) dtype document.
        parser = StructDtype if "fields" in dtype_doc else BuiltinDtype
        raw_dims = structure["dims"]
        return cls(
            data_type=parser.from_json(dtype_doc),
            chunks=tuple(tuple(block) for block in structure["chunks"]),
            shape=tuple(structure["shape"]),
            dims=None if raw_dims is None else tuple(raw_dims),
            resizable=structure.get("resizable", False),
        )

    @classmethod
    def from_array(cls, array, shape=None, chunks=None, dims=None) -> "ArrayStructure":
        """Describe an array-like object, inferring shape and chunking.

        Parameters
        ----------
        array : array-like (anything coercible via ``numpy.asanyarray``)
        shape : tuple, optional
            Overrides ``array.shape``. Why would shape ever differ from
            array.shape? Some formats (notably Zarr) force shape to be a
            multiple of a chunk size, such that array.shape may include a
            margin beyond the actual data.
        chunks : optional
            Chunk specification; defaults to the array's own ``.chunks``
            if present, else "auto" chunking.
        dims : tuple of str, optional
            Dimension names.
        """
        from dask.array.core import normalize_chunks

        if not hasattr(array, "__array__"):
            # May be a list of something; convert it to an array.
            array = numpy.asanyarray(array)
        if shape is None:
            shape = array.shape
        if chunks is None:
            # Chunked array types (e.g. dask) expose .chunks — it might be None.
            chunks = getattr(array, "chunks", None)
        if chunks is None:
            chunks = ("auto",) * len(shape)
        normalized_chunks = normalize_chunks(chunks, shape=shape, dtype=array.dtype)
        # Structured dtypes (with .fields) get StructDtype; scalars get BuiltinDtype.
        if array.dtype.fields is not None:
            data_type = StructDtype.from_numpy_dtype(array.dtype)
        else:
            data_type = BuiltinDtype.from_numpy_dtype(array.dtype)
        return ArrayStructure(
            data_type=data_type,
            shape=shape,
            chunks=normalized_chunks,
            dims=dims,
        )