Source code for emat.model.core_files.parsers


import os
import abc
import numpy as np
import pandas as pd
from typing import Mapping

from ...util.loggers import get_module_logger
_logger = get_module_logger(__name__)

class FileParser(abc.ABC):
    """
    A tool to parse performance measure(s) from an arbitrary file format.

    This is an abstract base class, which defines the basic API for file
    parsing objects.  Most users will want to use `TableParser` for reading
    performance measures from any kind of file that contains a table of
    data (including one-column, one-row, and one-value tables).

    Args:
        filename (str):
            The name of the file in which the measure(s) are stored.  The
            filename is a relative path to the file, and will be evaluated
            relative to the `from_dir` argument in the `read` method.
    """

    def __init__(
            self,
            filename,
    ):
        self.filename = filename

    @abc.abstractmethod
    def read(self, from_dir):
        """
        Read the performance measures.

        Args:
            from_dir (Path-like): The base directory from which to read the data.

        Returns:
            Dict: The measures read from this file.
        """
        pass

    @property
    @abc.abstractmethod
    def measure_names(self):
        """
        List: the measure names contained in this FileParser.
        """
        pass

    def __repr__(self):
        return f'<emat.model.core_files.{self.__class__.__name__} for "{self.filename}">'
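# Example (an illustrative sketch, not part of the original module): a concrete
# FileParser subclass only needs to implement `read` and `measure_names`.  The
# file name "status.txt" and the measure name "exit_code" below are hypothetical.
#
#     class StatusFileParser(FileParser):
#
#         def read(self, from_dir):
#             with open(os.path.join(from_dir, self.filename), 'rt') as f:
#                 return {'exit_code': float(f.read().strip())}
#
#         @property
#         def measure_names(self):
#             return ['exit_code']
#
#     parser = StatusFileParser("status.txt")
#     measures = parser.read(from_dir="/path/to/model/run")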
class TableParser(FileParser):
    """
    A tool to parse performance measures from an arbitrary table format.

    This object provides a way to systematically extract values from an
    output file that has a well defined name and format.  This is exactly
    what we would expect for a files-based core model, which when run
    (and post-processed, if applicable) will generate one or more named
    and regularly formatted output files.

    Args:
        filename (str):
            The name of the file in which the tabular data is stored.  The
            filename is a relative path to the file, and will be evaluated
            relative to the `from_dir` argument in the `read` method.
            Generally the `from_dir` will be a directory containing a set
            of model output files from a single model run, and this
            `filename` will be just the name of the file, unless the core
            model run constructs a sub-directory hierarchy within the
            output directory (this is unusual).
        measure_getters (Mapping[str, Getter]):
            A mapping that relates scalar performance measure values to
            Getters that extract values from the tabular data.
        reader_method (Callable, default pandas.read_csv):
            A function that accepts one positional argument (the filename
            to be read) and optionally some keyword arguments, and returns
            a pandas.DataFrame.
        handle_errors (str, default 'raise'):
            How to handle errors when reading a table, one of
            {'raise', 'nan'}.
        **kwargs (Mapping, optional):
            A set of fixed keyword arguments that will be passed to
            `reader_method` each time it is called.
    """

    def __init__(
            self,
            filename,
            measure_getters,
            reader_method=pd.read_csv,
            handle_errors='raise',
            **kwargs,
    ):
        super().__init__(filename)
        if not isinstance(measure_getters, Mapping):
            raise TypeError('measure_getters must be a mapping')
        self.measure_getters = measure_getters
        self.reader_method = reader_method
        self.reader_kwargs = kwargs
        if handle_errors not in {'raise', 'nan'}:
            raise ValueError("handle_errors not in {'raise', 'nan'}")
        self.handle_errors = handle_errors

    def raw(self, from_dir):
        """
        Read the raw tabular data.

        This method will read the raw file, using the `reader_method`
        defined for this `TableParser` and any designated keyword arguments
        for that reader, but it will not actually run any of the
        `measure_getters` that convert the table into individual
        performance measures.  This method is exposed primarily so that
        users can conveniently test `TableParser` objects during
        development.

        Args:
            from_dir (Path-like): The base directory from which to read the data.

        Returns:
            pandas.DataFrame
        """
        f = os.path.join(from_dir, self.filename)
        if not os.path.exists(f):
            raise FileNotFoundError(f)
        return self.reader_method(
            f,
            **self.reader_kwargs,
        )

    def read(self, from_dir):
        """
        Read the performance measures.

        Args:
            from_dir (Path-like): The base directory from which to read the data.

        Returns:
            Dict: The measures read from this file.
        """
        data = self.raw(from_dir)
        result = {}
        for measure_name, getter in self.measure_getters.items():
            try:
                result[measure_name] = getter(data)
            except Exception:
                if self.handle_errors == 'nan':
                    _logger.exception(f"Error in reading {os.path.join(from_dir, self.filename)}")
                    result[measure_name] = np.nan
                else:
                    _logger.error(f"Error in reading {os.path.join(from_dir, self.filename)}")
                    _logger.error(f"  table shape {data.shape}")
                    _logger.error(f"  index {data.index}")
                    _logger.error(f"  columns {data.columns}")
                    raise
        return result

    @property
    def measure_names(self):
        """
        List: the measure names contained in this TableParser.
        """
        return sorted(self.measure_getters.keys())
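# Example usage (a minimal sketch, not part of the original module): the file
# name "summary.csv", the labels, and the measure names below are hypothetical.
# Any extra keyword arguments (here, ``index_col=0``) are passed through to the
# reader method, which defaults to pandas.read_csv.
#
#     >>> parser = TableParser(
#     ...     "summary.csv",
#     ...     {
#     ...         'delay_hours': loc['Region1', 'delay'],
#     ...         'total_cost': loc_sum[:, 'cost'],
#     ...     },
#     ...     index_col=0,
#     ... )
#     >>> parser.measure_names
#     ['delay_hours', 'total_cost']
#     >>> measures = parser.read(from_dir="/path/to/model/run")  # doctest: +SKIP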
###

def slice_repr(x):
    # Render a slice the way it would be written inside square brackets,
    # e.g. slice(2, 5) becomes "2:5"; non-slice values fall back to repr().
    if isinstance(x, slice):
        if x.start is None and x.stop is not None:
            r = f":{x.stop}"
        elif x.start is not None and x.stop is None:
            r = f"{x.start}:"
        elif x.start is not None and x.stop is not None:
            r = f"{x.start}:{x.stop}"
        else:
            r = ":"
        if x.step is not None:
            r += f":{x.step}"
        return r
    else:
        return repr(x)


def tuple_repr_with_slice(xx):
    # Join a tuple of indexers into the bracketed form used by Getter reprs.
    return ",".join(slice_repr(x) for x in xx)
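# Example (illustrative only, not part of the original module): these helpers
# reproduce how an indexer would be written inside square brackets.
#
#     >>> slice_repr(slice(2, 5))
#     '2:5'
#     >>> tuple_repr_with_slice((slice(None), 'cost'))
#     ":,'cost'"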
class Getter:
    """
    A tool to get defined value[s] from a pandas.DataFrame.

    Use a getter by calling it with the DataFrame as the sole argument.
    """

    def __call__(self, x):
        raise NotImplementedError
class SingleGetter(Getter):
    def __init__(self, *item):
        self._item = item

    def __repr__(self):
        clsname = self.__class__.__name__[1:].replace('Neg', '-').lower()
        return f"{clsname}[{tuple_repr_with_slice(self._item)}]"

    def __add__(self, other):
        return SumOfGetter(self, other)

    def __sub__(self, other):
        return SumOfGetter(self, -other)


class SumOfGetter(Getter):
    def __init__(self, *parts):
        self._parts = list(parts)

    def __call__(self, x):
        return sum(p(x) for p in self._parts)

    def __repr__(self):
        return (" + ".join(repr(x) for x in self._parts)).replace(" + -", " - ")

    def __add__(self, other):
        return SumOfGetter(*self._parts, other)

    def __sub__(self, other):
        return SumOfGetter(*self._parts, -other)


class _Loc(SingleGetter):
    def __call__(self, x):
        return float(x.loc[self._item])

    def __neg__(self):
        return _NegLoc(*self._item)


class _NegLoc(SingleGetter):
    def __call__(self, x):
        return -float(x.loc[self._item])

    def __neg__(self):
        return _Loc(*self._item)


class _Loc_Sum(SingleGetter):
    def __call__(self, x):
        return np.sum(x.loc[self._item])


class _Loc_Mean(SingleGetter):
    def __call__(self, x):
        return np.nanmean(x.loc[self._item])


# The "maker" classes below give getters a pandas-style square-bracket
# construction syntax, e.g. ``loc['row', 'col']`` or ``loc_sum[:, 'col']``.

class __LocMaker:
    def __getitem__(self, item):
        return _Loc(*item)


class __LocSumMaker:
    def __getitem__(self, item):
        return _Loc_Sum(*item)


class __LocMeanMaker:
    def __getitem__(self, item):
        return _Loc_Mean(*item)


loc = __LocMaker()
loc_sum = __LocSumMaker()
loc_mean = __LocMeanMaker()


class _Iloc(SingleGetter):
    def __call__(self, x):
        return float(x.iloc[self._item])


class _Iloc_Sum(SingleGetter):
    def __call__(self, x):
        return np.sum(x.iloc[self._item])


class _Iloc_Mean(SingleGetter):
    def __call__(self, x):
        return np.nanmean(x.iloc[self._item])


class __IlocMaker:
    def __getitem__(self, item):
        return _Iloc(*item)


class __IlocSumMaker:
    def __getitem__(self, item):
        return _Iloc_Sum(*item)


class __IlocMeanMaker:
    def __getitem__(self, item):
        return _Iloc_Mean(*item)


iloc = __IlocMaker()
iloc_sum = __IlocSumMaker()
iloc_mean = __IlocMeanMaker()
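# Example (illustrative only, not part of the original module): getters are
# built with the ``loc``/``iloc`` makers by label or position, evaluated by
# calling them on a DataFrame, and can be combined with ``+`` and ``-``.
# The ``loc_sum``/``loc_mean`` and ``iloc_sum``/``iloc_mean`` variants
# aggregate over the selection instead of extracting a single value.
# The DataFrame below is made up for demonstration.
#
#     >>> df = pd.DataFrame(
#     ...     [[1.0, 2.0], [3.0, 4.0]],
#     ...     index=['row1', 'row2'],
#     ...     columns=['colA', 'colB'],
#     ... )
#     >>> loc['row1', 'colB'](df)
#     2.0
#     >>> iloc[1, 0](df)
#     3.0
#     >>> (loc['row1', 'colA'] + loc['row2', 'colB'])(df)
#     5.0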
class MappingParser(TableParser):
    """
    A tool to parse performance measures from an arbitrary mapping format.

    This object provides a way to systematically extract values from an
    output file that has a well defined name and format that defines some
    kind of mapping (i.e. like a Python dict).  This is exactly what we
    would expect for a files-based core model, which when run (and
    post-processed, if applicable) will generate one or more named and
    regularly formatted output files.

    Args:
        filename (str):
            The name of the file in which the mapping data is stored.  The
            filename is a relative path to the file, and will be evaluated
            relative to the `from_dir` argument in the `read` method.
            Generally the `from_dir` will be a directory containing a set
            of model output files from a single model run, and this
            `filename` will be just the name of the file, unless the core
            model run constructs a sub-directory hierarchy within the
            output directory (this is unusual).
        measure_getters (Mapping[str, Getter]):
            A mapping that relates scalar performance measure values to
            Getters that extract values from the mapping data.
        reader_method (Callable, optional):
            A function that accepts one positional argument (the filename
            to be read) and optionally some keyword arguments, and returns
            a Python mapping (i.e. a dict, or something that acts like a
            dict).  If not given, the file is read with `yaml.safe_load`.
        handle_errors (str, default 'raise'):
            How to handle errors when reading a file, one of
            {'raise', 'nan'}.
        **kwargs (Mapping, optional):
            A set of fixed keyword arguments that will be passed to
            `reader_method` each time it is called.
    """

    def __init__(
            self,
            filename,
            measure_getters,
            reader_method=None,
            handle_errors='raise',
            **kwargs,
    ):
        if reader_method is None:
            import yaml

            def safeload(f, **kw):
                with open(f, 'rt') as fi:
                    return yaml.safe_load(fi, **kw)

            reader_method = safeload
        super().__init__(filename, measure_getters, reader_method, handle_errors, **kwargs)

    def raw(self, from_dir):
        """
        Read the raw mapping data.

        This method will read the raw file, using the `reader_method`
        defined for this `MappingParser` and any designated keyword
        arguments for that reader, but it will not actually run any of the
        `measure_getters` that convert the mapping into individual
        performance measures.  This method is exposed primarily so that
        users can conveniently test `MappingParser` objects during
        development.

        Args:
            from_dir (Path-like): The base directory from which to read the data.

        Returns:
            Mapping
        """
        f = os.path.join(from_dir, self.filename)
        if not os.path.exists(f):
            raise FileNotFoundError(f)
        return self.reader_method(
            f,
            **self.reader_kwargs,
        )

    def read(self, from_dir):
        """
        Read the performance measures.

        Args:
            from_dir (Path-like): The base directory from which to read the data.

        Returns:
            Dict: The measures read from this file.
        """
        data = self.raw(from_dir)
        result = {}
        for measure_name, getter in self.measure_getters.items():
            try:
                result[measure_name] = getter(data)
            except Exception:
                if self.handle_errors == 'nan':
                    _logger.exception(f"Error in reading {os.path.join(from_dir, self.filename)}")
                    result[measure_name] = np.nan
                else:
                    _logger.error(f"Error in reading mapping {os.path.join(from_dir, self.filename)}")
                    _logger.error(f"  data is: {data}")
                    raise
        return result

    @property
    def measure_names(self):
        """
        List: the measure names contained in this MappingParser.
        """
        return sorted(self.measure_getters.keys())
class _Key(SingleGetter):
    # Extract a single value from a mapping by key and coerce it to float.
    def __call__(self, x):
        return float(x[self._item[0]])


class __KeyMaker:
    def __getitem__(self, item):
        return _Key(item)


key = __KeyMaker()
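# Example usage (a minimal sketch, not part of the original module): the file
# name "metrics.yaml" and the measure names below are hypothetical.  By default
# a MappingParser reads its file with yaml.safe_load, so a file containing
#
#     vmt: 12345.6
#     transit_boardings: 7890
#
# could be parsed with ``key`` getters like this:
#
#     >>> parser = MappingParser(
#     ...     "metrics.yaml",
#     ...     {
#     ...         'vmt': key['vmt'],
#     ...         'transit_boardings': key['transit_boardings'],
#     ...     },
#     ... )
#     >>> measures = parser.read(from_dir="/path/to/model/run")  # doctest: +SKIP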