Source code for tiled.readers.excel

import dask.dataframe
import pandas

from ..server.object_cache import NO_CACHE, get_object_cache, with_object_cache
from ..trees.in_memory import Tree
from .dataframe import DataFrameAdapter


class ExcelReader(Tree):
    """
    Read the sheets in an Excel file.

    This maps the Excel file, which may contain one or more spreadsheets,
    onto a "Tree" of tabular structures: one entry per sheet, keyed by the
    sheet name.

    Examples
    --------

    Given a file path

    >>> ExcelReader.from_file("path/to/excel_file.xlsx")

    Given a file object (opened in binary mode)

    >>> file = open("path/to/excel_file.xlsx", "rb")
    >>> ExcelReader.from_file(file)

    Given a pandas.ExcelFile object

    >>> import pandas
    >>> ef = pandas.ExcelFile(file)
    >>> ExcelReader.from_file(ef)
    """

    @classmethod
    def from_file(cls, file):
        """
        Build a reader from a path, a file object, or a pandas.ExcelFile.

        Parameters
        ----------
        file : path, file object, or pandas.ExcelFile
            A pandas.ExcelFile is used as-is; anything else is handed to
            ``pandas.ExcelFile`` to open the workbook.

        Returns
        -------
        An instance of ``cls`` constructed from a dict that maps each sheet
        name to a DataFrameAdapter around a single-partition dask dataframe.
        """
        workbook = (
            file if isinstance(file, pandas.ExcelFile) else pandas.ExcelFile(file)
        )
        # If an instance has previously been created using the same
        # parameters, then we are here because the caller wants a *fresh*
        # view on this data, so any cached data is cleared below.
        object_cache = get_object_cache()
        caching_enabled = object_cache is not NO_CACHE
        # Every per-sheet cache key shares this prefix; note that it uses the
        # original ``file`` argument, not the opened workbook.
        key_prefix = (cls.__module__, cls.__qualname__, file)
        adapters = {}
        for name in workbook.sheet_names:
            key = (*key_prefix, name)
            # NOTE(review): the sheet is fetched through the object cache
            # *before* the entries are discarded; presumably an entry already
            # stored under this key would be returned instead of a fresh
            # parse -- confirm against with_object_cache.
            frame = with_object_cache(key, workbook.parse, name)
            ddf = dask.dataframe.from_pandas(
                frame,
                npartitions=1,  # TODO Be smarter about this.
            )
            if caching_enabled:
                object_cache.discard(key)  # parsed sheet content
                object_cache.discard_dask(ddf.__dask_keys__())  # dask tasks
            adapters[name] = DataFrameAdapter(ddf)
        return cls(adapters)