Source code for emat.database.database

# -*- coding: utf-8 -*-
"""
Created on Tue Oct 23 10:11:05 2018

@author: mmilkovits

Abstract Base Class for data storage format

"""

import abc
import pandas as pd
from contextlib import contextmanager

[docs]class Database(abc.ABC):
    
    """ Abstract Base Class for EMAT data storage
    
    Database constains the design experiments, meta-model parameters, 
    and the core and meta-model results (performance measures)
    """

    def __init__(self, readonly=False):
        self.readonly = readonly
        self.__locked = False

    @property
    @contextmanager
    def lock(self):
        """Context manager to temporarily mark this database as locked."""
        self.__locked = True
        yield
        self.__locked = False

    @property
    def is_locked(self):
        return self.readonly or self.__locked

[docs]    def get_db_info(self):
        """
        Get a short string describing this Database

        Returns:
            str
        """
        return "no info available"

[docs]    @abc.abstractmethod
    def init_xlm(self, parameter_list, measure_list):
        """
        Initialize or extend set of experiment variables and measures

        Initialize database with universe of risk variables,
        policy variables, and performance measures. All variables and measures
        defined in scopes must be defined in this set.
        This method only needs to be run
        once after creating a new database.

        Args:
            parameter_list (List[tuple]):
                Experiment variable tuples (variable name, type)
                where variable name is a string and
                type is 'uncertainty', 'lever', or 'constant'
            measure_list (List[tuple]):
                Performance measure tuples (name, transform), where
                name is a string and transform is a defined transformation
                used in metamodeling, currently supported include {'log', None}.

        """
    
    @abc.abstractmethod
    def _write_scope(self, scope_name, sheet, scp_xl, scp_m, content):
        """
        Save the emat scope information to the database.

        Generally users should not call this function directly,
        use `store_scope` instead.

        Args:
            scope_name (str):
                The scope name, used to identify experiments,
                performance measures, and results associated with this model.
                Multiple scopes can be stored in the same database.
            sheet (str):
                Yaml file name with scope definition.
            scp_xl (List[str]):
                Scope parameter names - both uncertainties and policy levers
            scp_m (List[str]):
                Scope performance measure names
            content (Scope, optional):
                Scope object to be pickled and stored in the database.
        Raises:
            KeyError:
                If scope name already exists, the scp_vars are not
                available, or the performance measures are not initialized
                in the database.

        """

[docs]    @abc.abstractmethod
    def update_scope(self, scope):
        """
        Update the emat scope information in the database.

        Args:
            scope (Scope): scope to update
        """

[docs]    @abc.abstractmethod
    def store_scope(self, scope):
        """
        Save an emat.Scope directly to the database.

        Args:
            scope (Scope): The scope object to store.
        Raises:
            KeyError: If scope name already exists.
        """

[docs]    @abc.abstractmethod
    def read_scope(self, scope_name=None):
        """
        Load the pickled scope from the database.

        Args:
            scope_name (str, optional):
                The name of the scope to load.  If not
                given and there is only one scope stored
                in the database, that scope is loaded. If not
                given and there are multiple scopes stored in
                the database, a KeyError is raised.

        Returns:
            Scope

        Raises:
            KeyError: If a name is given but is not found in
                the database, or if no name is given but there
                is more than one scope stored.
        """

[docs]    @abc.abstractmethod
    def add_scope_meas(self, scope_name, scp_m):
        """Update the set of performance measures associated with the scope
        
        Use this function when the core model runs are complete to add
        performance measures to the scope and post-process against the 
        archived results
          
        Args:
            scope_name (str): scope name, used to identify experiments,
                performance measures, and results associated with this run
            scp_m (List[str]): scope performance measures
        Raises:
            KeyError: If scope name does not exist or the 
                performance measures are not initialized in the database.
        
        """     
    
[docs]    @abc.abstractmethod
    def delete_scope(self, scope_name):
        """Delete the scope from the database
        
        Deletes the scope as well as any experiments and results associated
        with the scope
        
        Args:
            scope_name (str): scope name, used to identify experiments,
                performance measures, and results associated with this run"""  
        
[docs]    @abc.abstractmethod
    def write_experiment_parameters(
            self,
            scope_name,
            design_name,
            xl_df,
    ):
        """
        Write experiment definitions the the database.
        
        This method records values for each experiment parameter,
        for each experiment in a design of one or more experiments.
        
        Args:
            scope_name (str):
                A scope name, used to identify experiments,
                performance measures, and results associated with this
                exploratory analysis. The scope with this name should
                already have been stored in this database.
            design_name (str):
                An experiment design name. This name should be unique
                within the named scope, and typically will include a
                reference to the design sampler, for example:
                'uni' - generated by univariate sensitivity test design
                'lhs' - generated by latin hypercube sample design
                The design_name is used primarily to load groups of
                related experiments together.
            xl_df (pandas.Dataframe):
                The columns of this DataFrame are the experiment
                parameters (i.e. policy levers, uncertainties, and
                constants), and each row is an experiment.

        Returns:
            list: the experiment id's of the newly recorded experiments

        Raises:
            UserWarning: If scope name does not exist
            TypeError: If not all scope variables are defined in the 
                exp_def
        """     

[docs]    def write_experiment_parameters_1(
            self,
            scope_name,
            design_name: str,
            *args,
            **kwargs
    ):
        """
        Write experiment definitions for a single experiment.

        This method records values for each experiment parameter,
        for a single experiment only.

        Args:
            scope_name (str):
                A scope name, used to identify experiments,
                performance measures, and results associated with this
                exploratory analysis. The scope with this name should
                already have been stored in this database.
            design_name (str):
                An experiment design name. This name should be unique
                within the named scope, and typically will include a
                reference to the design sampler, for example:
                'uni' - generated by univariate sensitivity test design
                'lhs' - generated by latin hypercube sample design
                The design_name is used primarily to load groups of
                related experiments together.
            *args, **kwargs (Mapping[s]):
                A dictionary where the keys are experiment parameter names
                (i.e. policy levers, uncertainties, and constants), and
                values are the the parameter values for this experiment.
                Subsequent positional or keyword arguments are used to update
                the parameters.

        Returns:
            int: The experiment id of the newly recorded experiments

        Raises:
            UserWarning: If scope name does not exist
            TypeError: If not all scope variables are defined in the
                exp_def
        """
        parameters = {}
        for a in args:
            if a is not None:
                parameters.update(a)
        parameters.update(kwargs)
        xl_df = pd.DataFrame(parameters, index=[0])
        result = self.write_experiment_parameters(scope_name, design_name, xl_df)
        return result[0]


[docs]    @abc.abstractmethod
    def read_experiment_parameters(
            self,
            scope_name,
            design_name=None,
            only_pending=False,
            design=None,
            *,
            experiment_ids=None,
            ensure_dtypes=True,
    ):
        """
        Read experiment definitions from the database.
        
        Read the values for each experiment parameter per experiment.
        
        Args:
            scope_name (str):
                A scope name, used to identify experiments,
                performance measures, and results associated with this
                exploratory analysis.
            design_name (str, optional): If given, only experiments
                associated with both the scope and the named design
                are returned, otherwise all experiments associated
                with the scope are returned.
            only_pending (bool, default False): If True, only pending
                experiments (which have no performance measure results
                stored in the database) are returned.
            design (str, optional): Deprecated.  Use design_name.
            experiment_ids (Collection, optional):
                A collection of experiment id's to load.  If given,
                both `design_name` and `only_pending` are ignored.
            ensure_dtypes (bool, default True): If True, the scope
                associated with these experiments is also read out
                of the database, and that scope file is used to
                format experimental data consistently (i.e., as
                float, integer, bool, or categorical).

        Returns:
            emat.ExperimentalDesign:
                The experiment parameters are returned in a subclass
                of a normal pandas.DataFrame, which allows attaching
                the `design_name` as meta-data to the DataFrame.

        Raises:
            ValueError: if `scope_name` is not stored in this database
        """    

[docs]    @abc.abstractmethod
    def write_experiment_measures(
            self,
            scope_name,
            source,
            m_df,
            run_ids=None,
            experiment_id=None,
    ):
        """
        Write experiment results to the database.
        
        Write the performance measure results for each experiment 
        in the scope - if the scope does not exist, nothing is recorded.

        Note that the design_name is not required to write experiment
        measures, as the individual experiments from any design are
        uniquely identified by the experiment id's.
        
        Args:
            scope_name (str):
                A scope name, used to identify experiments,
                performance measures, and results associated with this
                exploratory analysis. The scope with this name should
                already have been stored in this database.
            source (int):
                An indicator of performance measure source. This should
                be 0 for a bona-fide run of the associated core models,
                or some non-zero metamodel_id number.
            m_df (pandas.DataFrame):
                The columns of this DataFrame are the performance
                measure names, and row indexes are the experiment id's.
            run_ids (pandas.Index, optional):
                Provide an optional index of universally unique run ids
                (UUIDs) for these results. The UUIDs can be used to help
                identify problems and organize model runs.

        Raises:
            UserWarning: If scope name does not exist        
        """         

[docs]    @abc.abstractmethod
    def read_experiment_all(
            self,
            scope_name,
            design_name=None,
            source=None,
            *,
            only_pending=False,
            only_incomplete=False,
            only_complete=False,
            only_with_measures=False,
            ensure_dtypes=True,
            with_run_ids=False,
            runs=None,
    ):
        """
        Read experiment definitions and results
        
        Read the values from each experiment variable and the 
        results for each performance measure per experiment.
        
        Args:
            scope_name (str):
                A scope name, used to identify experiments,
                performance measures, and results associated with this
                exploratory analysis.
            design_name (str or Collection[str], optional):
                The experimental design name (a single `str`) or
                a collection of design names to read.
            source (int, optional): The source identifier of the
                experimental outcomes to load.  If not given, but
                there are only results from a single source in the
                database, those results are returned.  If there are
                results from multiple sources, an error is raised.
            only_pending (bool, default False): If True, only pending
                experiments (which have no performance measure results
                stored in the database) are returned. Experiments that
                have any results, even if only partial results, are
                excluded.
            only_incomplete (bool, default False): If True, only incomplete
                experiments (which have at least one missing performance
                measure result that is not stored in the database) are
                returned.  Only complete experiments (that have every
                performance measure populated) are excluded.
            only_complete (bool, default False): If True, only complete
                experiments (which have no missing performance measure
                results stored in the database) are returned.
            only_with_measures (bool, default False): If True, only
                experiments with at least one stored performance measure
                are returned.
            ensure_dtypes (bool, default True): If True, the scope
                associated with these experiments is also read out
                of the database, and that scope file is used to
                format experimental data consistently (i.e., as
                float, integer, bool, or categorical).
            with_run_ids (bool, default False): Whether to use a
                two-level pd.MultiIndex that includes both the
                experiment_id (which always appears in the index)
                as well as the run_id (which only appears in the
                index if this argument is set to True).
            runs ({None, 'all', 'valid', 'invalid'}, default None):
                By default, this method returns the one and only
                valid model run matching the given `design_name`
                and `source` (if any) for any experiment, and fails
                if there is more than one such valid run. Set this to
                'valid' or 'invalid' to get all valid or invalid model
                runs (instead of raising an exception). Set to 'all' to get
                everything, including both valid and invalidated results.

        Returns:
            emat.ExperimentalDesign:
                The experiment parameters are returned in a subclass
                of a normal pandas.DataFrame, which allows attaching
                the `design_name` as meta-data to the DataFrame.

        Raises:
            ValueError
                When no source is given but the database contains
                results from multiple sources.
        """

[docs]    @abc.abstractmethod
    def read_experiment_measures(
            self,
            scope_name,
            design_name=None,
            experiment_id=None,
            source=None,
            design=None,
            runs=None,
    ):
        """
        Read experiment results from the database.

        Args:
            scope_name (str or Scope):
                A scope or just its name, used to identify experiments,
                performance measures, and results associated with this
                exploratory analysis.
            design_name (str, optional): If given, only experiments
                associated with both the scope and the named design
                are returned, otherwise all experiments associated
                with the scope are returned.
            experiment_id (int, optional): The id of the experiment to
                retrieve.  If omitted, get all experiments matching the
                named scope and design.
            source (int, optional): The source identifier of the
                experimental outcomes to load.  If not given, but
                there are only results from a single source in the
                database, those results are returned.  If there are
                results from multiple sources, an error is raised.
            design (str): Deprecated, use `design_name`.
            runs ({None, 'all', 'valid', 'invalid'}, default None):
                By default, this method fails if there is more than
                one valid model run matching the given `design_name`
                and `source` (if any) for any experiment.  Set this to
                'valid' or 'invalid' to get all valid or invalid model
                runs (instead of raising an exception). Set to 'all' to get
                everything, including both valid and invalidated results.
            formulas (bool, default True): If the scope includes
                formulaic measures (computed directly from other
                measures) then compute these values and include them in
                the results.

        Returns:
            results (pandas.DataFrame): performance measures

        Raises:
            ValueError
                When the database contains multiple sets of results
                matching the given `design_name` and/or `source`
                (if any) for any experiment.
        """

[docs]    @abc.abstractmethod
    def read_experiment_measure_sources(
            self,
            scope_name,
            design_name=None,
            experiment_id=None,
            design=None,
    ):
        """
        Read all source ids from the results stored in the database.

        Args:
            scope_name (str):
                A scope name, used to identify experiments,
                performance measures, and results associated with this
                exploratory analysis.
            design_name (str, optional): If given, only experiments
                associated with both the scope and the named design
                are returned, otherwise all experiments associated
                with the scope are returned.
            experiment_id (int, optional): The id of the experiment to
                retrieve.  If omitted, get all experiments matching the
                named scope and design.
            design (str): Deprecated, use `design_name`.

        Returns:
            List[Int]: performance measure source ids
        """


[docs]    @abc.abstractmethod
    def delete_experiments(self, scope_name, design_name=None, design=None):
        """
        Delete experiment definitions and results.
        
        The method removes the linkage between experiments and the
        identified experimental design.  Experiment parameters and results
        are only removed if they are also not linked to any other experimental
        design stored in the database.
        
        Args:
            scope_name (str): scope name, used to identify experiments,
                performance measures, and results associated with this run
            design_name (str): Only experiments
                associated with both the scope and the named design
                are deleted.
            design (str): Deprecated, use `design_name`.
        """


[docs]    @abc.abstractmethod
    def delete_experiment_measures(
            self,
            experiment_ids=None,
    ):
        """
        Delete experiment performance measure results.

        The method removes only the performance measures, not the
        parameters.  This can be useful if a set of corrupted model
        results was stored in the database.

        Args:
            experiment_ids (Collection, optional):
                A collection of experiment id's for which measures shall
                be deleted.  Note that no scope or design are given here,
                experiments must be individually identified.

        """


[docs]    @abc.abstractmethod
    def write_experiment_all(self, scope_name, design_name, source, xlm_df):
        """
        Write experiment definitions and results
        
        Writes the values from each experiment variable and the 
        results for each performance measure per experiment
        
        Args:
            scope_name (str):
                A scope name, used to identify experiments,
                performance measures, and results associated with this
                exploratory analysis. The scope with this name should
                already have been stored in this database.
            design_name (str):
                An experiment design name. This name should be unique
                within the named scope, and typically will include a
                reference to the design sampler, for example:
                'uni' - generated by univariate sensitivity test design
                'lhs' - generated by latin hypercube sample design
                The design_name is used primarily to load groups of
                related experiments together.
            source (int):
                An indicator of performance measure source. This should
                be 0 for a bona fide run of the associated core models,
                or some non-zero metamodel_id number.
            xlm_df (pandas.Dataframe):
                The columns of this DataFrame are the experiment
                parameters (i.e. policy levers, uncertainties, and
                constants) and performance measures, and each row
                is an experiment.

        Raises:
            DesignExistsError: If scope and design already exist
            TypeError: If not all scope variables are defined in the 
                experiment
        """     


[docs]    @abc.abstractmethod
    def read_scope_names(self, design_name=None) -> list:
        """A list of all available scopes in the database.

        Args:
            design_name (str, optional): If a design name, is given, only
                scopes containing a design with this name are returned.

        Returns:
            list
        """

[docs]    @abc.abstractmethod
    def read_design_names(self, scope_name:str) -> list:
        """A list of all available designs for a given scope.

        Args:
            scope_name (str): scope name, used to identify experiments,
                performance measures, and results associated with this run
        """

[docs]    @abc.abstractmethod
    def read_experiment_id(self, scope_name, *args, **kwargs):
        """
        Read the experiment id previously defined in the database

        Args:
            scope_name (str): scope name, used to identify experiments,
                performance measures, and results associated with this run
            parameters (dict): keys are experiment parameters, values are the
                experimental values to look up.  Subsequent positional or keyword
                arguments are used to update parameters.

        Returns:
            int: the experiment id of the identified experiment

        Raises:
            ValueError: If scope name does not exist
            ValueError: If multiple experiments match an experiment definition.
                This can happen, for example, if the definition is incomplete.
        """


[docs]    @abc.abstractmethod
    def read_experiment_ids(self, scope_name, xl_df):
        """
        Read the experiment ids previously defined in the database.

        This method is used to recover the experiment id, if the
        set of parameter values is known but the id of the experiment
        is not known.

        Args:
            scope_name (str): scope name, used to identify experiments,
                performance measures, and results associated with this run
            xl_df (pandas.DataFrame): columns are experiment parameters,
                each row is a full experiment

        Returns:
            list: the experiment id's of the identified experiments

        Raises:
            ValueError: If scope name does not exist
            ValueError: If multiple experiments match an experiment definition.
                This can happen, for example, if the definition is incomplete.
        """


[docs]    @abc.abstractmethod
    def read_uncertainties(self, scope_name:str) -> list:
        """A list of all uncertainties for a given scope.

        Args:
            scope_name (str): scope name
        """

[docs]    @abc.abstractmethod
    def read_levers(self, scope_name:str) -> list:
        """A list of all levers for a given scope.

        Args:
            scope_name (str): scope name
        """

[docs]    @abc.abstractmethod
    def read_constants(self, scope_name:str) -> list:
        """A list of all constants for a given scope.

        Args:
            scope_name (str): scope name
        """

[docs]    @abc.abstractmethod
    def read_measures(self, scope_name:str) -> list:
        """A list of all performance measures for a given scope.

        Args:
            scope_name (str): scope name
        """

[docs]    @abc.abstractmethod
    def write_metamodel(self, scope_name, metamodel, metamodel_id=None, metamodel_name=''):
        """Store a meta-model in the database

         Args:
            scope_name (str): scope name
            metamodel (emat.MetaModel): The meta-model to be stored.
                If a PythonCoreModel containing a MetaModel is given,
                the MetaModel will be extracted.
            metamodel_id (int, optional): A unique id number for this
                metamodel.  If no id number is given and it cannot be
                inferred from `metamodel`, a unique id number
                will be created.
            metamodel_name (str, optional): A name for this meta-model.
                If no name is given and it cannot be
                inferred from `metamodel`, an empty string is used.
       """


[docs]    @abc.abstractmethod
    def read_metamodel(self, scope_name, metamodel_id=None):
        """Retrieve a meta-model from the database.

        Args:
            scope_name (str): scope name
            metamodel_id (int, optional): A unique id number for this
                metamodel.  If not given but there is exactly one
                metamodel stored for the given scope, that metamodel
                will be returned.

        Returns:
            PythonCoreModel: The meta-model, ready to use
        """

[docs]    @abc.abstractmethod
    def read_metamodel_ids(self, scope_name):
        """A list of all metamodel id's for a given scope.

        Args:
            scope_name (str): scope name
        """

[docs]    @abc.abstractmethod
    def get_new_metamodel_id(self, scope_name):
        """Get a new unused metamodel id for a given scope.

        Args:
            scope_name (str): scope name

        Returns:
            int
        """

[docs]    @abc.abstractmethod
    def read_box(self, scope_name: str, box_name: str, scope=None):
        """
        Read a Box from the database.

        Args:
            scope_name (str):
                The name of the scope from which to read the box.
            box_name (str):
                The name of the box to read.
            scope (Scope, optional):
                The Scope to assign to the Box that is returned.
                If not given, no Scope object is assigned to the
                box.

        Returns:
            Box
        """

[docs]    @abc.abstractmethod
    def read_box_names(self, scope_name: str):
        """
        Get the names of all boxes associated with a particular scope.

        Args:
            scope_name (str):
                The name of the scope from which to read the Box names.

        Returns:
            list[str]
        """

[docs]    @abc.abstractmethod
    def read_box_parent_name(self, scope_name: str, box_name:str):
        """
        Get the name of the parent box for a particular box in the database

        Args:
            scope_name (str):
                The name of the scope from which to read the Box parent.
            box_name (str):
                The name of the box from which to read the parent.

        Returns:
            str or None:
                If the identified box has a parent, this is the name of that
                parent, otherwise None is returned.

        """

[docs]    @abc.abstractmethod
    def read_box_parent_names(self, scope_name: str):
        """
        Get the name of the parent box for each box in the database.

        Args:
            scope_name (str):
                The name of the scope from which to read Box parents.

        Returns:
            dict
                A dictionary, with keys giving Box names and values
                giving the respective Box parent names.

        """

[docs]    @abc.abstractmethod
    def read_boxes(self, scope_name: str=None, scope=None):
        """
        Read Boxes from the database.

        Args:
            scope_name (str, optional):
                The name of the scope from which to load Boxes. This
                is used exclusively to identify the Boxes to load from
                the database, and the scope by this name is not attached
                to the Boxes, unless `scope` is given, in which case this
                argument is ignored.
            scope (Scope, optional):
                The scope to assign to the Boxes.  If not given,
                no Scope object is assigned.

        Returns:
            Boxes
        """

[docs]    @abc.abstractmethod
    def write_box(self, box, scope_name=None):
        """
        Write a single box to the database.

        Args:
            box (Box):
                The Box to write to the database.
            scope_name (str, optional):
                The scope name to use when writing to the database. If
                the `boxes` has a particular scope assigned, the name
                of that scope is used.

        Raises:
            ValueError:
                If the `box` has a particular scope assigned, and
                `scope_name` is given but it is not the same name
                of the assigned scope.

        """

[docs]    @abc.abstractmethod
    def write_boxes(self, boxes, scope_name=None):
        """
        Write Boxes to the database.

        Args:
            boxes (Boxes):
                The collection of Boxes to write to the database.
            scope_name (str, optional):
                The scope name to use when writing to the database. If
                the `boxes` has a particular scope assigned, the name
                of that scope is used.

        Raises:
            ValueError:
                If the `boxes` has a particular scope assigned, and
                `scope_name` is given but it is not the same name
                of the assigned scope.

        """

[docs]    @abc.abstractmethod
    def new_run_id(
            self,
            scope_name=None,
            parameters=None,
            location=None,
            experiment_id=None,
            source=0,
    ):
        """
        Create a new run_id in the database.

        Args:
            scope_name (str): scope name, used to identify experiments,
                performance measures, and results associated with this run
            parameters (dict): keys are experiment parameters, values are the
                experimental values to look up.  Subsequent positional or keyword
                arguments are used to update parameters.
            location (str or True, optional): An identifier for this location
                (i.e. this computer).  If set to True, the name of this node
                is found using the `platform` module.
            experiment_id (int, optional): The experiment id associated
                with this run.  If given, the parameters are ignored.
            source (int, default 0): The metamodel_id of the source for this
                run, or 0 for a core model run.

        Returns:
            Tuple[Int,Int]:
                The run_id and experiment_id of the identified experiment

        Raises:
            ValueError: If scope name does not exist
            ValueError: If multiple experiments match an experiment definition.
                This can happen, for example, if the definition is incomplete.
        """

[docs]    def info(self, stream=None):
        """
        Print info about scopes and designs in this database.
        """
        if stream is None:
            import sys
            stream = sys.stdout
        print(f"<emat.{self.__class__.__name__}>", file=stream)
        scope_names = self.read_scope_names()
        for scope_name in scope_names:
            print(f"scope: {scope_name}:", file=stream)
            design_names = self.read_design_names(scope_name)
            if design_names:
                print(f"  designs:", file=stream)
                for design_name in design_names:
                    print(f"    - {design_name}", file=stream)
            else:
                print(f"  no designs", file=stream)