import enum
import os
import sys
from collections.abc import Mapping
from dataclasses import dataclass
from typing import Any, ClassVar, List, Optional, Tuple, Union
import numpy
from tiled.structures.root import Structure
from ..type_aliases import Chunks
# Shape of one entry in numpy's ``dtype.descr``: (name, dtype-string) or,
# for sub-array fields, (name, dtype-string, shape).
FieldDescr = Union[Tuple[str, str], Tuple[str, str, Tuple[int, ...]]]
NumpyDescr = List[FieldDescr]
class Endianness(str, enum.Enum):
    """
    An enum of endian values: big, little, not_applicable.
    """

    # Subclassing str makes members compare equal to (and serialize as)
    # their plain string values.
    big = "big"
    little = "little"
    not_applicable = "not_applicable"
class ObjectArrayTypeDisabled(ValueError):
    """
    Raised when a numpy 'object'-kind dtype ("O") is requested but support
    was not enabled via the ``TILED_ALLOW_OBJECT_ARRAYS`` environment
    variable at import time. See ``Kind._missing_``.
    """

    pass
class Kind(str, enum.Enum):
    """
    See https://numpy.org/devdocs/reference/arrays.interface.html#object.__array_interface__
    The term "kind" comes from the numpy API as well.
    Note: At import time, the environment variable ``TILED_ALLOW_OBJECT_ARRAYS``
    is checked. If it is set to anything other than ``"0"``, then this
    Enum gets an additional member::
        object = "O"
    to support numpy 'object'-type arrays which hold generic Python objects.
    Numpy 'object'-type arrays are not enabled by default because their binary
    representation is not interpretable by clients other than Python. It is
    recommended to convert your data to a non-object type if possible so that it
    can be read by non-Python clients.
    """

    bit_field = "t"
    boolean = "b"
    integer = "i"
    unsigned_integer = "u"
    floating_point = "f"
    complex_floating_point = "c"
    timedelta = "m"
    datetime = "M"
    string = "S"  # fixed-length sequence of char
    unicode = "U"  # fixed-length sequence of Py_UNICODE
    other = "V"  # "V" is for "void" -- generic fixed-size chunk of memory

    # By default, do not tolerate numpy object arrays
    if os.getenv("TILED_ALLOW_OBJECT_ARRAYS", "0") != "0":
        object = "O"  # Object (i.e. the memory contains a pointer to PyObject)

    @classmethod
    def _missing_(cls, key: str):
        # Give a helpful, specific error (rather than enum's generic
        # ValueError) when an 'object'-kind dtype is requested while object
        # arrays are disabled. For any other unknown key, returning None
        # lets the enum machinery raise its usual ValueError.
        if key == "O":
            raise ObjectArrayTypeDisabled(
                "Numpy 'object'-type arrays are not enabled by default "
                "because their binary representation is not interpretable "
                "by clients other than Python. "
                "It is recommended to convert your data to a non-object type "
                "if possible so that it can be read by non-Python clients. "
                "If this is not possible, you may enable 'object'-type arrays "
                "by setting the environment variable TILED_ALLOW_OBJECT_ARRAYS=1 "
                "in the server."
            )
@dataclass
class BuiltinDtype:
    """
    Describe a scalar (non-structured) numpy dtype in transportable terms.
    """

    endianness: Endianness
    kind: Kind
    itemsize: int  # size of one item, in bytes
    # Datetime/timedelta base units with optional count, e.g. "[ns]" or
    # "[10s]"; None for non-datetime kinds.
    dt_units: Optional[str] = None

    # Map numpy byteorder characters to Endianness names. "=" (native) is
    # resolved eagerly to this machine's byte order.
    _endianness_map: ClassVar[Mapping[str, str]] = {
        ">": "big",
        "<": "little",
        "=": sys.byteorder,
        "|": "not_applicable",
    }
    # Inverse mapping; "=" is excluded so that each endianness value maps
    # back to exactly one numpy character.
    _endianness_reverse_map: ClassVar[Mapping[str, str]] = {
        v: k for k, v in _endianness_map.items() if k != "="
    }

    @classmethod
    def from_numpy_dtype(cls, dtype: numpy.dtype) -> "BuiltinDtype":
        """Build a BuiltinDtype from a numpy dtype object."""
        # Extract datetime units from the dtype string representation,
        # e.g. `'<M8[ns]'` has `dt_units = '[ns]'`. Count determines the
        # number of base units in a step; numpy omits it when it is 1.
        dt_units = None
        if dtype.kind in ("m", "M"):
            unit, count = numpy.datetime_data(dtype)
            dt_units = f"[{count if count > 1 else ''}{unit}]"
        return cls(
            # Coerce to the enum member, consistent with from_json();
            # the map stores plain strings.
            endianness=Endianness(cls._endianness_map[dtype.byteorder]),
            kind=Kind(dtype.kind),
            itemsize=dtype.itemsize,
            dt_units=dt_units,
        )

    def to_numpy_dtype(self) -> numpy.dtype:
        """Reconstruct the equivalent numpy dtype."""
        return numpy.dtype(self.to_numpy_str())

    def to_numpy_str(self) -> str:
        """Render as a numpy dtype string, e.g. '<f8' or '<M8[ns]'."""
        endianness = self._endianness_reverse_map[self.endianness]
        # dtype.itemsize always reports bytes. For the numeric types the
        # string format is {type_code}{byte_count}, so we can directly use
        # the item size.
        #
        # For unicode, however, the pattern is 'U{char_count}', and numpy
        # always represents text as 4-byte UCS4 encoding (because variable
        # width encodings do not fit with fixed-stride arrays), so the
        # reported itemsize is 4x the char count. To get back to the string
        # we need to divide by 4.
        size = self.itemsize if self.kind != Kind.unicode else self.itemsize // 4
        return f"{endianness}{self.kind.value}{size}{self.dt_units or ''}"

    def to_numpy_descr(self):
        "An alias for to_numpy_str() to match the StructDtype interface."
        return self.to_numpy_str()

    @classmethod
    def from_json(cls, structure: Mapping[str, Any]) -> "BuiltinDtype":
        """Build a BuiltinDtype from its JSON (dict) representation."""
        return cls(
            kind=Kind(structure["kind"]),
            itemsize=structure["itemsize"],
            endianness=Endianness(structure["endianness"]),
            dt_units=structure.get("dt_units"),
        )
@dataclass
class Field:
    """
    One named field of a structured dtype: a name, a dtype, and an optional
    inner shape (present only for sub-array fields).
    """

    name: str
    dtype: Union[BuiltinDtype, "StructDtype"]
    shape: Optional[Tuple[int, ...]]

    @classmethod
    def from_numpy_descr(cls, field: FieldDescr) -> "Field":
        """Build a Field from one entry of numpy's ``dtype.descr``."""
        field_name = field[0]
        if field_name == "":
            # An empty name indicates descr of a base/subdtype, not a field.
            raise ValueError(
                f"You seem to have gotten descr of a base or subdtype: {field}"
            )
        remainder = field[1:]
        if len(remainder) == 1:
            (type_spec,) = remainder
            inner_shape = None
        else:
            type_spec, inner_shape = remainder
        # A plain string spec is a builtin dtype; a nested descr (list) is
        # a structured dtype.
        if isinstance(type_spec, str):
            parsed_dtype = BuiltinDtype.from_numpy_dtype(numpy.dtype(type_spec))
        else:
            parsed_dtype = StructDtype.from_numpy_dtype(numpy.dtype(type_spec))
        return cls(name=field_name, dtype=parsed_dtype, shape=inner_shape)

    def to_numpy_descr(self) -> FieldDescr:
        """Render this field as one entry of a numpy ``dtype.descr`` list."""
        if isinstance(self.dtype, BuiltinDtype):
            type_part = self.dtype.to_numpy_str()
        else:
            type_part = self.dtype.to_numpy_descr()
        if self.shape is None:
            return (self.name, type_part)
        return (self.name, type_part, self.shape)

    @classmethod
    def from_json(cls, structure: Mapping[str, Any]) -> "Field":
        """Build a Field from its JSON (dict) representation."""
        dtype_doc = structure["dtype"]
        # A "fields" key marks a nested structured dtype.
        if "fields" in dtype_doc:
            parsed_dtype = StructDtype.from_json(dtype_doc)
        else:
            parsed_dtype = BuiltinDtype.from_json(dtype_doc)
        return cls(
            name=structure["name"], dtype=parsed_dtype, shape=structure["shape"]
        )
@dataclass
class StructDtype:
    """
    Describe a structured (record) numpy dtype: the total itemsize plus a
    list of named fields.
    """

    itemsize: int
    fields: List[Field]

    @classmethod
    def from_numpy_dtype(cls, dtype: numpy.dtype) -> "StructDtype":
        """Build a StructDtype from a structured numpy dtype."""
        # Subdtypes push extra dimensions into arrays; those should be
        # handled a layer up and reported as an array with bigger dimensions.
        if dtype.subdtype is not None:
            raise ValueError(f"We do not know how to encode subdtypes: {dtype}")
        # If this is a builtin type, require the use of BuiltinDtype
        # (nee .array.BuiltinDtype).
        if dtype.fields is None:
            raise ValueError(f"You have a base type: {dtype}")
        parsed_fields = [Field.from_numpy_descr(entry) for entry in dtype.descr]
        return cls(itemsize=dtype.itemsize, fields=parsed_fields)

    def to_numpy_dtype(self) -> numpy.dtype:
        """Reconstruct the equivalent structured numpy dtype."""
        return numpy.dtype(self.to_numpy_descr())

    def to_numpy_descr(self) -> NumpyDescr:
        """Render as a numpy ``dtype.descr``-style list of field entries."""
        return [each.to_numpy_descr() for each in self.fields]

    def max_depth(self) -> int:
        """Return the nesting depth: 1 plus the deepest nested StructDtype."""
        per_field_depths = (
            1 if isinstance(f.dtype, BuiltinDtype) else 1 + f.dtype.max_depth()
            for f in self.fields
        )
        return max(per_field_depths)

    @classmethod
    def from_json(cls, structure: Mapping[str, Any]) -> "StructDtype":
        """Build a StructDtype from its JSON (dict) representation."""
        return cls(
            itemsize=structure["itemsize"],
            fields=[Field.from_json(doc) for doc in structure["fields"]],
        )
@dataclass
class ArrayStructure(Structure):
    """
    Describe an array: its dtype, chunking, shape, and optional dim names.
    """

    data_type: Union[BuiltinDtype, StructDtype]
    chunks: Chunks  # tuple-of-tuples-of-ints like ((3,), (3,))
    shape: Tuple[int, ...]  # tuple of ints like (3, 3)
    dims: Optional[Tuple[str, ...]] = None  # None or tuple of names like ("x", "y")
    resizable: Union[bool, Tuple[bool, ...]] = False

    @classmethod
    def from_json(cls, structure: Mapping[str, Any]) -> "ArrayStructure":
        """Build an ArrayStructure from its JSON (dict) representation."""
        # A "fields" key marks a structured dtype; otherwise it is builtin.
        if "fields" in structure["data_type"]:
            data_type = StructDtype.from_json(structure["data_type"])
        else:
            data_type = BuiltinDtype.from_json(structure["data_type"])
        dims = structure["dims"]
        if dims is not None:
            dims = tuple(dims)
        return cls(
            data_type=data_type,
            chunks=tuple(map(tuple, structure["chunks"])),
            shape=tuple(structure["shape"]),
            dims=dims,
            resizable=structure.get("resizable", False),
        )

    @classmethod
    def from_array(cls, array, shape=None, chunks=None, dims=None) -> "ArrayStructure":
        """
        Build an ArrayStructure describing ``array``.

        Parameters
        ----------
        array : array-like
            Anything with ``__array__`` (else it is coerced via
            ``numpy.asanyarray``).
        shape : tuple, optional
            Override for ``array.shape``; see comment below.
        chunks : optional
            Chunking spec accepted by dask's ``normalize_chunks``; defaults
            to the array's own ``chunks`` attribute or "auto".
        dims : tuple of str, optional
            Dimension names.
        """
        from dask.array.core import normalize_chunks

        if not hasattr(array, "__array__"):
            # may be a list of something; convert to array
            array = numpy.asanyarray(array)
        # Why would shape ever be different from array.shape, you ask?
        # Some formats (notably Zarr) force shape to be a multiple of
        # a chunk size, such that array.shape may include a margin beyond the
        # actual data.
        if shape is None:
            shape = array.shape
        if chunks is None:
            # Dask (and some other) array types carry their own chunking.
            chunks = getattr(array, "chunks", None)  # might still be None
        if chunks is None:
            chunks = ("auto",) * len(shape)
        normalized_chunks = normalize_chunks(
            chunks,
            shape=shape,
            dtype=array.dtype,
        )
        if array.dtype.fields is not None:
            data_type = StructDtype.from_numpy_dtype(array.dtype)
        else:
            data_type = BuiltinDtype.from_numpy_dtype(array.dtype)
        return cls(
            data_type=data_type,
            shape=shape,
            chunks=normalized_chunks,
            dims=dims,
        )