Source code for tiled.structures.dataframe

import base64
import io
from dataclasses import dataclass
from typing import List, Tuple, Union

# Data-URI-style prefix placed in front of the base64-encoded Arrow schema
# stored in DataFrameStructure.arrow_schema; it is checked for and stripped
# again when the schema is decoded.
B64_ENCODED_PREFIX = "data:application/vnd.apache.arrow.file;base64,"


@dataclass
class DataFrameStructure:
    """
    Describe a partitioned DataFrame: Arrow schema, partition count, columns.

    Instances are normally built with one of the alternate constructors
    (``from_dask_dataframe``, ``from_pandas``, ``from_json``).
    """

    # This holds an Arrow schema, base64-encoded so that it can be transported
    # as JSON. For clarity, the encoded data (...) is prefixed like:
    #
    #     data:application/vnd.apache.arrow.file;base64,...
    #
    # Arrow does not support an official JSON serialization, but it
    # could in the future: https://github.com/apache/arrow/pull/7110
    # If it does, we could switch to using that here.
    arrow_schema: str
    # Number of partitions (1 when built from a single pandas DataFrame).
    npartitions: int
    # Column names, in DataFrame order.
    columns: List[str]
    # NOTE(review): presumably flags whether the structure may grow, either
    # as a whole or per dimension -- confirm against callers.
    resizable: Union[bool, Tuple[bool, ...]] = False

    @staticmethod
    def _schema_to_data_uri(schema):
        """Serialize a pyarrow schema and return it as a prefixed base64 string."""
        schema_b64 = base64.b64encode(schema.serialize()).decode("utf-8")
        return B64_ENCODED_PREFIX + schema_b64

    @classmethod
    def from_dask_dataframe(cls, ddf):
        """Build a structure from a dask DataFrame without loading row data."""
        import dask.dataframe.utils
        import pyarrow

        # Make a pandas DataFrame with 0 rows.
        # We can use this to define an Arrow schema without loading any row data.
        meta = dask.dataframe.utils.make_meta(ddf)
        schema = pyarrow.Table.from_pandas(meta).schema
        return cls(
            arrow_schema=cls._schema_to_data_uri(schema),
            npartitions=ddf.npartitions,
            columns=list(ddf.columns),
        )

    @classmethod
    def from_pandas(cls, df):
        """Build a single-partition structure from a pandas DataFrame."""
        import pyarrow

        schema = pyarrow.Table.from_pandas(df).schema
        return cls(
            arrow_schema=cls._schema_to_data_uri(schema),
            npartitions=1,
            columns=list(df.columns),
        )

    @property
    def arrow_schema_decoded(self):
        """
        Decode ``arrow_schema`` back into a pyarrow schema object.

        Raises
        ------
        ValueError
            If ``arrow_schema`` does not start with the expected prefix.
        """
        import pyarrow

        if not self.arrow_schema.startswith(B64_ENCODED_PREFIX):
            raise ValueError(
                f"Expected base64-encoded data prefixed with {B64_ENCODED_PREFIX}."
            )
        # Strip the prefix; what remains is the base64 payload.
        payload = self.arrow_schema[len(B64_ENCODED_PREFIX) :].encode(  # noqa: E203
            "utf-8"
        )
        return pyarrow.ipc.read_schema(io.BytesIO(base64.b64decode(payload)))

    @property
    def meta(self):
        """Return an empty pandas DataFrame built from the decoded schema."""
        return self.arrow_schema_decoded.empty_table().to_pandas()

    @classmethod
    def from_json(cls, content):
        """
        Build a structure from a decoded JSON mapping.

        ``arrow_schema``, ``npartitions`` and ``columns`` are required keys.
        ``resizable`` is optional and falls back to the dataclass default
        (``False``) when absent.
        """
        return cls(
            arrow_schema=content["arrow_schema"],
            npartitions=content["npartitions"],
            columns=content["columns"],
            resizable=content.get("resizable", False),
        )