Source code for emat.model.core_files.parsers


import os
import abc
import numpy as np
import pandas as pd
from typing import Mapping

from ...util.loggers import get_module_logger
_logger = get_module_logger(__name__)

class FileParser(abc.ABC):
    """
    A tool to parse performance measure(s) from an arbitrary file format.

    This is an abstract base class, which defines the basic API for file
    parsing objects.  Most users will want to use `TableParser` for reading
    performance measures from any kind of file that contains a table of
    data (including one-column, one-row, and one-value tables).

    Args:
        filename (str):
            The name of the file in which the measure(s) are stored.  The
            filename is a relative path to the file, and will be evaluated
            relative to the `from_dir` argument in the `read` method.
    """

    def __init__(
            self,
            filename,
    ):
        self.filename = filename

    @abc.abstractmethod
    def read(self, from_dir):
        """
        Read the performance measures.

        Args:
            from_dir (Path-like): The base directory from which to read the data.

        Returns:
            Dict: The measures read from this file.
        """
        pass

    @property
    @abc.abstractmethod
    def measure_names(self):
        """
        List: the measure names contained in this FileParser.
        """
        pass

    def __repr__(self):
        return f'<emat.model.core_files.{self.__class__.__name__} for "{self.filename}">'
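# Example (an illustrative sketch, not part of the original module): a concrete
# FileParser subclass only needs to implement `read` and `measure_names`.  The
# file name "status.txt" and the measure name "exit_code" below are hypothetical.
#
#     class StatusFileParser(FileParser):
#
#         def read(self, from_dir):
#             with open(os.path.join(from_dir, self.filename), 'rt') as f:
#                 return {'exit_code': float(f.read().strip())}
#
#         @property
#         def measure_names(self):
#             return ['exit_code']
#
#     parser = StatusFileParser("status.txt")
#     measures = parser.read(from_dir="/path/to/model/run")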
class TableParser(FileParser):
    """
    A tool to parse performance measures from an arbitrary table format.

    This object provides a way to systematically extract values from an
    output file that has a well defined name and format.  This is exactly
    what we would expect for a files-based core model, which when run
    (and post-processed, if applicable) will generate one or more named
    and regularly formatted output files.

    Args:
        filename (str):
            The name of the file in which the tabular data is stored.  The
            filename is a relative path to the file, and will be evaluated
            relative to the `from_dir` argument in the `read` method.
            Generally the `from_dir` will be a directory containing a set
            of model output files from a single model run, and this
            `filename` will be just the name of the file, unless the core
            model run constructs a sub-directory hierarchy within the
            output directory (this is unusual).
        measure_getters (Mapping[str, Getter]):
            A mapping that relates scalar performance measure values to
            Getters that extract values from the tabular data.
        reader_method (Callable, default pandas.read_csv):
            A function that accepts one positional argument (the filename
            to be read) and optionally some keyword arguments, and returns
            a pandas.DataFrame.
        handle_errors (str, default 'raise'):
            How to handle errors when reading a table, one of
            {'raise', 'nan'}.
        **kwargs (Mapping, optional):
            A set of fixed keyword arguments that will be passed to
            `reader_method` each time it is called.
    """

    def __init__(
            self,
            filename,
            measure_getters,
            reader_method=pd.read_csv,
            handle_errors='raise',
            **kwargs,
    ):
        super().__init__(filename)
        if not isinstance(measure_getters, Mapping):
            raise TypeError('measure_getters must be a mapping')
        self.measure_getters = measure_getters
        self.reader_method = reader_method
        self.reader_kwargs = kwargs
        if handle_errors not in {'raise', 'nan'}:
            raise ValueError("handle_errors not in {'raise', 'nan'}")
        self.handle_errors = handle_errors

    def raw(self, from_dir):
        """
        Read the raw tabular data.

        This method will read the raw file, using the `reader_method`
        defined for this `TableParser` and any designated keyword arguments
        for that reader, but it will not actually run any of the
        `measure_getters` that convert the table into individual
        performance measures.  This method is exposed primarily so that
        users can conveniently test `TableParser` objects during
        development.

        Args:
            from_dir (Path-like): The base directory from which to read the data.

        Returns:
            pandas.DataFrame
        """
        f = os.path.join(from_dir, self.filename)
        if not os.path.exists(f):
            raise FileNotFoundError(f)
        return self.reader_method(
            f,
            **self.reader_kwargs,
        )

    def read(self, from_dir):
        """
        Read the performance measures.

        Args:
            from_dir (Path-like): The base directory from which to read the data.

        Returns:
            Dict: The measures read from this file.
        """
        data = self.raw(from_dir)
        result = {}
        for measure_name, getter in self.measure_getters.items():
            try:
                result[measure_name] = getter(data)
            except Exception:
                if self.handle_errors == 'nan':
                    _logger.exception(f"Error in reading {os.path.join(from_dir, self.filename)}")
                    result[measure_name] = np.nan
                else:
                    _logger.error(f"Error in reading {os.path.join(from_dir, self.filename)}")
                    _logger.error(f"  table shape {data.shape}")
                    _logger.error(f"  index {data.index}")
                    _logger.error(f"  columns {data.columns}")
                    raise
        return result

    @property
    def measure_names(self):
        """
        List: the measure names contained in this TableParser.
        """
        return sorted(self.measure_getters.keys())
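# Example usage (a minimal sketch, not part of the original module): the file
# name "summary.csv", the labels, and the measure names below are hypothetical.
# Any extra keyword arguments (here, ``index_col=0``) are passed through to the
# reader method, which defaults to pandas.read_csv.
#
#     >>> parser = TableParser(
#     ...     "summary.csv",
#     ...     {
#     ...         'delay_hours': loc['Region1', 'delay'],
#     ...         'total_cost': loc_sum[:, 'cost'],
#     ...     },
#     ...     index_col=0,
#     ... )
#     >>> parser.measure_names
#     ['delay_hours', 'total_cost']
#     >>> measures = parser.read(from_dir="/path/to/model/run")  # doctest: +SKIP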
###

def slice_repr(x):
    # Render a slice the way it would be written inside square brackets,
    # e.g. slice(2, 5) becomes "2:5"; non-slice values fall back to repr().
    if isinstance(x, slice):
        if x.start is None and x.stop is not None:
            r = f":{x.stop}"
        elif x.start is not None and x.stop is None:
            r = f"{x.start}:"
        elif x.start is not None and x.stop is not None:
            r = f"{x.start}:{x.stop}"
        else:
            r = ":"
        if x.step is not None:
            r += f":{x.step}"
        return r
    else:
        return repr(x)


def tuple_repr_with_slice(xx):
    # Join a tuple of indexers into the bracketed form used by Getter reprs.
    return ",".join(slice_repr(x) for x in xx)
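# Example (illustrative only, not part of the original module): these helpers
# reproduce how an indexer would be written inside square brackets.
#
#     >>> slice_repr(slice(2, 5))
#     '2:5'
#     >>> tuple_repr_with_slice((slice(None), 'cost'))
#     ":,'cost'"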
class Getter:
    """
    A tool to get defined value[s] from a pandas.DataFrame.

    Use a getter by calling it with the DataFrame as the sole argument.
    """

    def __call__(self, x):
        raise NotImplementedError
class SingleGetter(Getter):
    def __init__(self, *item):
        self._item = item

    def __repr__(self):
        clsname = self.__class__.__name__[1:].replace('Neg', '-').lower()
        return f"{clsname}[{tuple_repr_with_slice(self._item)}]"

    def __add__(self, other):
        return SumOfGetter(self, other)

    def __sub__(self, other):
        return SumOfGetter(self, -other)


class SumOfGetter(Getter):
    def __init__(self, *parts):
        self._parts = list(parts)

    def __call__(self, x):
        return sum(p(x) for p in self._parts)

    def __repr__(self):
        return (" + ".join(repr(x) for x in self._parts)).replace(" + -", " - ")

    def __add__(self, other):
        return SumOfGetter(*self._parts, other)

    def __sub__(self, other):
        return SumOfGetter(*self._parts, -other)


class _Loc(SingleGetter):
    def __call__(self, x):
        return float(x.loc[self._item])

    def __neg__(self):
        return _NegLoc(*self._item)


class _NegLoc(SingleGetter):
    def __call__(self, x):
        return -float(x.loc[self._item])

    def __neg__(self):
        return _Loc(*self._item)


class _Loc_Sum(SingleGetter):
    def __call__(self, x):
        return np.sum(x.loc[self._item])


class _Loc_Mean(SingleGetter):
    def __call__(self, x):
        return np.nanmean(x.loc[self._item])


# The "maker" classes below give getters a pandas-style square-bracket
# construction syntax, e.g. ``loc['row', 'col']`` or ``loc_sum[:, 'col']``.

class __LocMaker:
    def __getitem__(self, item):
        return _Loc(*item)


class __LocSumMaker:
    def __getitem__(self, item):
        return _Loc_Sum(*item)


class __LocMeanMaker:
    def __getitem__(self, item):
        return _Loc_Mean(*item)


loc = __LocMaker()
loc_sum = __LocSumMaker()
loc_mean = __LocMeanMaker()


class _Iloc(SingleGetter):
    def __call__(self, x):
        return float(x.iloc[self._item])


class _Iloc_Sum(SingleGetter):
    def __call__(self, x):
        return np.sum(x.iloc[self._item])


class _Iloc_Mean(SingleGetter):
    def __call__(self, x):
        return np.nanmean(x.iloc[self._item])


class __IlocMaker:
    def __getitem__(self, item):
        return _Iloc(*item)


class __IlocSumMaker:
    def __getitem__(self, item):
        return _Iloc_Sum(*item)


class __IlocMeanMaker:
    def __getitem__(self, item):
        return _Iloc_Mean(*item)


iloc = __IlocMaker()
iloc_sum = __IlocSumMaker()
iloc_mean = __IlocMeanMaker()
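# Example (illustrative only, not part of the original module): getters are
# built with the ``loc``/``iloc`` makers by label or position, evaluated by
# calling them on a DataFrame, and can be combined with ``+`` and ``-``.
# The ``loc_sum``/``loc_mean`` and ``iloc_sum``/``iloc_mean`` variants
# aggregate over the selection instead of extracting a single value.
# The DataFrame below is made up for demonstration.
#
#     >>> df = pd.DataFrame(
#     ...     [[1.0, 2.0], [3.0, 4.0]],
#     ...     index=['row1', 'row2'],
#     ...     columns=['colA', 'colB'],
#     ... )
#     >>> loc['row1', 'colB'](df)
#     2.0
#     >>> iloc[1, 0](df)
#     3.0
#     >>> (loc['row1', 'colA'] + loc['row2', 'colB'])(df)
#     5.0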
class MappingParser(TableParser):
    """
    A tool to parse performance measures from an arbitrary mapping format.

    This object provides a way to systematically extract values from an
    output file that has a well defined name and format that defines some
    kind of mapping (i.e. like a Python dict).  This is exactly what we
    would expect for a files-based core model, which when run (and
    post-processed, if applicable) will generate one or more named and
    regularly formatted output files.

    Args:
        filename (str):
            The name of the file in which the mapping data is stored.  The
            filename is a relative path to the file, and will be evaluated
            relative to the `from_dir` argument in the `read` method.
            Generally the `from_dir` will be a directory containing a set
            of model output files from a single model run, and this
            `filename` will be just the name of the file, unless the core
            model run constructs a sub-directory hierarchy within the
            output directory (this is unusual).
        measure_getters (Mapping[str, Getter]):
            A mapping that relates scalar performance measure values to
            Getters that extract values from the mapping data.
        reader_method (Callable, optional):
            A function that accepts one positional argument (the filename
            to be read) and optionally some keyword arguments, and returns
            a Python mapping (i.e. a dict, or something that acts like a
            dict).  If not given, the file is read with `yaml.safe_load`.
        handle_errors (str, default 'raise'):
            How to handle errors when reading a file, one of
            {'raise', 'nan'}.
        **kwargs (Mapping, optional):
            A set of fixed keyword arguments that will be passed to
            `reader_method` each time it is called.
    """

    def __init__(
            self,
            filename,
            measure_getters,
            reader_method=None,
            handle_errors='raise',
            **kwargs,
    ):
        if reader_method is None:
            import yaml

            def safeload(f, **kw):
                with open(f, 'rt') as fi:
                    return yaml.safe_load(fi, **kw)

            reader_method = safeload
        super().__init__(filename, measure_getters, reader_method, handle_errors, **kwargs)

    def raw(self, from_dir):
        """
        Read the raw mapping data.

        This method will read the raw file, using the `reader_method`
        defined for this `MappingParser` and any designated keyword
        arguments for that reader, but it will not actually run any of the
        `measure_getters` that convert the mapping into individual
        performance measures.  This method is exposed primarily so that
        users can conveniently test `MappingParser` objects during
        development.

        Args:
            from_dir (Path-like): The base directory from which to read the data.

        Returns:
            Mapping
        """
        f = os.path.join(from_dir, self.filename)
        if not os.path.exists(f):
            raise FileNotFoundError(f)
        return self.reader_method(
            f,
            **self.reader_kwargs,
        )

    def read(self, from_dir):
        """
        Read the performance measures.

        Args:
            from_dir (Path-like): The base directory from which to read the data.

        Returns:
            Dict: The measures read from this file.
        """
        data = self.raw(from_dir)
        result = {}
        for measure_name, getter in self.measure_getters.items():
            try:
                result[measure_name] = getter(data)
            except Exception:
                if self.handle_errors == 'nan':
                    _logger.exception(f"Error in reading {os.path.join(from_dir, self.filename)}")
                    result[measure_name] = np.nan
                else:
                    _logger.error(f"Error in reading mapping {os.path.join(from_dir, self.filename)}")
                    _logger.error(f"  data is: {data}")
                    raise
        return result

    @property
    def measure_names(self):
        """
        List: the measure names contained in this MappingParser.
        """
        return sorted(self.measure_getters.keys())
class _Key(SingleGetter):
    # Extract a single value from a mapping by key and coerce it to float.
    def __call__(self, x):
        return float(x[self._item[0]])


class __KeyMaker:
    def __getitem__(self, item):
        return _Key(item)


key = __KeyMaker()
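# Example usage (a minimal sketch, not part of the original module): the file
# name "metrics.yaml" and the measure names below are hypothetical.  By default
# a MappingParser reads its file with yaml.safe_load, so a file containing
#
#     vmt: 12345.6
#     transit_boardings: 7890
#
# could be parsed with ``key`` getters like this:
#
#     >>> parser = MappingParser(
#     ...     "metrics.yaml",
#     ...     {
#     ...         'vmt': key['vmt'],
#     ...         'transit_boardings': key['transit_boardings'],
#     ...     },
#     ... )
#     >>> measures = parser.read(from_dir="/path/to/model/run")  # doctest: +SKIP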