Source code for wrfhydropy.core.outputdiffs

import io
import shlex
import subprocess
import warnings
import pandas as pd
import pathlib

from ..util.xrcmp import xrcmp
from .simulation import SimulationOutput



[docs]
def compare_ncfiles(
    candidate_files: list,
    reference_files: list,
    stats_only: bool = False,
    nccmp_options: list = None,
    exclude_vars: list = None,
    exclude_atts: list = None,
    xrcmp_n_cores: int = 0
):
    """Compare two lists of netcdf files pairwise, in list order.

    The i-th candidate file is compared against the i-th reference file, so the caller is
    responsible for passing lists that are already aligned.

    Args:
        candidate_files: List of candidate netcdf file paths
        reference_files: List of reference netcdf file paths
        stats_only: Only return statistics on differences in data values
        nccmp_options: List of long-form command line options passed to nccmp,
        see http://nccmp.sourceforge.net/ for options. Defaults are '--data', '--metadata',
        '--force'
        exclude_vars: A list of strings containing variables names to
        exclude from the comparison.
        exclude_atts: A list of strings containing attribute names to exclude from the
        comparison.
        xrcmp_n_cores: When greater than 1 and '--metadata' is not among the nccmp options,
        the xrcmp based comparison is used (and receives this value as its core count);
        otherwise nccmp is used.
    Returns:
        A list with one entry per file pair, holding whatever the selected comparison
        function returned for that pair.
    Raises:
        ValueError: If the two file lists differ in length.
    """

    options = ['--data', '--metadata', '--force'] if nccmp_options is None else nccmp_options

    if len(candidate_files) != len(reference_files):
        raise ValueError('Length of candidate files does not match len of reference files')

    results = []
    for candidate, reference in zip(candidate_files, reference_files):
        # Round-trip through pathlib so the path text is normalised before use.
        candidate_path = str(pathlib.Path(candidate))
        reference_path = str(pathlib.Path(reference))

        # The backend is picked per pair and looked up lazily, exactly as before.
        compare = (
            _compare_nc_xrcmp
            if (xrcmp_n_cores > 1 and '--metadata' not in options)
            else _compare_nc_nccmp
        )

        results.append(
            compare(
                candidate_nc=candidate_path,
                reference_nc=reference_path,
                stats_only=stats_only,
                nccmp_options=options,
                exclude_vars=exclude_vars,
                exclude_atts=exclude_atts,
                n_cores=xrcmp_n_cores
            )
        )
    return results




[docs]
class OutputDataDiffs(object):
    def __init__(
        self,
        candidate_output: SimulationOutput,
        reference_output: SimulationOutput,
        nccmp_options: list = None,
        exclude_vars: list = None,
        exclude_atts: list = None,
        xrcmp_n_cores: int = 0
    ):
        """Calculate data diffs between two SimulationOutput objects.

        For every output type that is not None on both outputs, the files common to both
        are compared pairwise with ``stats_only=True`` and the results are stored on the
        attribute of the same name. ``diff_counts`` records, per output type, how many
        file pairs produced a result other than None.

        Args:
            candidate_output: The candidate SimulationOutput object
            reference_output: The reference SimulationOutput object
            nccmp_options: List of long-form command line options passed to nccmp,
            see http://nccmp.sourceforge.net/ for options. Defaults are '--data', '--force'
            exclude_vars: A list of strings containing variables names to
            exclude from the comparison.
            exclude_atts: A list of strings containing attribute names to exclude from the
            comparison. Defaults are 'valid_min'
            xrcmp_n_cores: Passed through to compare_ncfiles as ``xrcmp_n_cores``.
        """

        # Set default arguments
        if nccmp_options is None:
            nccmp_options = ['--data', '--force']

        if exclude_atts is None:
            exclude_atts = ['valid_min']

        # Instantiate all attributes
        self.diff_counts = dict()
        """dict: Number of file pairs with diffs, keyed by output type"""

        self.channel_rt = list()
        """list: Comparison results (pandas dataframes if possible, otherwise whatever the
        comparison function returned) for the channel_rt files"""
        self.channel_rt_grid = list()
        """list: Comparison results (pandas dataframes if possible, otherwise whatever the
        comparison function returned) for the channel_rt_grid files"""
        self.chanobs = list()
        """list: Comparison results (pandas dataframes if possible, otherwise whatever the
        comparison function returned) for the chanobs files"""
        self.lakeout = list()
        """list: Comparison results (pandas dataframes if possible, otherwise whatever the
        comparison function returned) for the lakeout files"""
        self.gwout = list()
        """list: Comparison results (pandas dataframes if possible, otherwise whatever the
        comparison function returned) for the gwout files"""
        self.rtout = list()
        """list: Comparison results (pandas dataframes if possible, otherwise whatever the
        comparison function returned) for the rtout files"""
        self.ldasout = list()
        """list: Comparison results (pandas dataframes if possible, otherwise whatever the
        comparison function returned) for the ldasout files"""
        self.restart_hydro = list()
        """list: Comparison results (pandas dataframes if possible, otherwise whatever the
        comparison function returned) for the hydro restart files"""
        self.restart_lsm = list()
        """list: Comparison results (pandas dataframes if possible, otherwise whatever the
        comparison function returned) for the lsm restart files"""
        self.restart_nudging = list()
        """list: Comparison results (pandas dataframes if possible, otherwise whatever the
        comparison function returned) for the nudging restart files"""

        # Create list of attributes to diff
        atts_list = ['channel_rt', 'channel_rt_grid', 'chanobs', 'lakeout', 'gwout', 'rtout',
                     'ldasout', 'restart_hydro', 'restart_lsm', 'restart_nudging']
        for att in atts_list:
            candidate_files = getattr(candidate_output, att)
            reference_files = getattr(reference_output, att)

            # Output types missing on either side are skipped and keep their empty list.
            if candidate_files is None or reference_files is None:
                continue

            # Keep only the files present in both outputs
            valid_files = _check_file_lists(candidate_files, reference_files)

            results = compare_ncfiles(
                candidate_files=valid_files[0],
                reference_files=valid_files[1],
                stats_only=True,
                nccmp_options=nccmp_options,
                exclude_vars=exclude_vars,
                exclude_atts=exclude_atts,
                xrcmp_n_cores=xrcmp_n_cores
            )
            setattr(self, att, results)

            # A None entry means "no differences". Use an identity test rather than
            # filter(None.__ne__, ...): None.__ne__(x) returns NotImplemented for any
            # non-None x, and evaluating NotImplemented as a boolean is deprecated
            # (Python 3.9+) and an error in newer interpreters.
            self.diff_counts[att] = sum(1 for result in results if result is not None)



[docs]
class OutputMetaDataDiffs(object):
    def __init__(
        self,
        candidate_output: SimulationOutput,
        reference_output: SimulationOutput,
        stats_only=False,
        nccmp_options: list = None,
        exclude_vars: list = None,
        exclude_atts: list = None,
        xrcmp_n_cores: int = 0
    ):
        """Calculate metadata diffs between two SimulationOutput objects.

        For every output type that is not None on both outputs, the files common to both
        are compared pairwise and the results are stored on the attribute of the same
        name. ``diff_counts`` records, per output type, how many file pairs produced a
        result other than None.

        Args:
            candidate_output: The candidate SimulationOutput object
            reference_output: The reference SimulationOutput object
            stats_only: NOTE(review): accepted but not forwarded to compare_ncfiles, which
            therefore runs with its own default (False) - confirm whether this is intended.
            nccmp_options: List of long-form command line options passed to nccmp,
            see http://nccmp.sourceforge.net/ for options. Defaults are '--metadata', '--force'
            exclude_vars: A list of strings containing variables names to
            exclude from the comparison.
            exclude_atts: A list of strings containing attribute names to exclude from the
            comparison. Defaults are 'valid_min'
            xrcmp_n_cores: Passed through to compare_ncfiles as ``xrcmp_n_cores``.
        """
        # Set default arguments
        if nccmp_options is None:
            nccmp_options = ['--metadata', '--force']

        if exclude_atts is None:
            exclude_atts = ['valid_min']

        # Instantiate all attributes
        self.diff_counts = dict()
        """dict: Number of file pairs with diffs, keyed by output type"""

        self.channel_rt = list()
        """list: Comparison results (whatever the comparison function returned for each
        differing file pair) for the channel_rt files"""
        self.chanobs = list()
        """list: Comparison results (whatever the comparison function returned for each
        differing file pair) for the chanobs files"""
        self.lakeout = list()
        """list: Comparison results (whatever the comparison function returned for each
        differing file pair) for the lakeout files"""
        self.gwout = list()
        """list: Comparison results (whatever the comparison function returned for each
        differing file pair) for the gwout files"""
        self.rtout = list()
        """list: Comparison results (whatever the comparison function returned for each
        differing file pair) for the rtout files"""
        self.ldasout = list()
        """list: Comparison results (whatever the comparison function returned for each
        differing file pair) for the ldasout files"""
        self.restart_hydro = list()
        """list: Comparison results (whatever the comparison function returned for each
        differing file pair) for the hydro restart files"""
        self.restart_lsm = list()
        """list: Comparison results (whatever the comparison function returned for each
        differing file pair) for the lsm restart files"""
        self.restart_nudging = list()
        """list: Comparison results (whatever the comparison function returned for each
        differing file pair) for the nudging restart files"""

        # Create list of attributes to diff
        atts_list = ['channel_rt', 'chanobs', 'lakeout', 'gwout', 'rtout', 'ldasout',
                     'restart_hydro', 'restart_lsm', 'restart_nudging']
        for att in atts_list:
            candidate_files = getattr(candidate_output, att)
            reference_files = getattr(reference_output, att)

            # Output types missing on either side are skipped and keep their empty list.
            if candidate_files is None or reference_files is None:
                continue

            # Keep only the files present in both outputs
            valid_files = _check_file_lists(candidate_files, reference_files)

            results = compare_ncfiles(
                candidate_files=valid_files[0],
                reference_files=valid_files[1],
                nccmp_options=nccmp_options,
                exclude_vars=exclude_vars,
                exclude_atts=exclude_atts,
                xrcmp_n_cores=xrcmp_n_cores
            )
            setattr(self, att, results)

            # A None entry means "no differences". Use an identity test rather than
            # filter(None.__ne__, ...): None.__ne__(x) returns NotImplemented for any
            # non-None x, and evaluating NotImplemented as a boolean is deprecated
            # (Python 3.9+) and an error in newer interpreters.
            self.diff_counts[att] = sum(1 for result in results if result is not None)



def _compare_nc_xrcmp(
    candidate_nc: str,
    reference_nc: str,
    stats_only: bool = False,
    nccmp_options: list = None,
    exclude_vars: list = None,
    exclude_atts: list = None,
    n_cores=1,
    log_file_path: str = "xrcmp.log"
):
    """Private method to compare two netcdf files using xrcmp.

    Args:
        candidate_nc: The path for the candidate netcdf file
        reference_nc: The path for the reference netcdf file
        stats_only: If True, try to return the xrcmp log parsed as a pandas dataframe
        instead of the raw log text
        nccmp_options: Unused here; accepted so the signature matches _compare_nc_nccmp
        exclude_vars: A list of strings containing variables names to
        exclude from the comparison.
        exclude_atts: Unused here; accepted so the signature matches _compare_nc_nccmp
        n_cores: Passed to xrcmp as ``n_cores``
        log_file_path: Where xrcmp writes its log. A value without any '/' is treated as
        a bare file name and placed in the directory of the candidate file.
    Returns:
        None if xrcmp returns 0. Otherwise the log file contents as a string, or, when
        stats_only is True, the log parsed into a pandas dataframe (falling back to the
        xrcmp return code if parsing fails).
    """

    # Try and set files to strings
    candidate_nc = str(candidate_nc)
    reference_nc = str(reference_nc)

    log_file = pathlib.Path(log_file_path)
    if '/' not in str(log_file_path):
        log_file = pathlib.Path(candidate_nc).parent / log_file

    # The comparison must run regardless of how the log path was given; previously it
    # only ran for bare file names, leaving `ret` unbound for any path containing '/'.
    ret = xrcmp(
        can_file=candidate_nc,
        ref_file=reference_nc,
        log_file=str(log_file),
        exclude_vars=exclude_vars,
        n_cores=n_cores
    )

    if ret == 0:
        return None

    if not stats_only:
        # read_text opens and closes the file, so no handle is leaked
        return log_file.read_text()

    try:
        # sep=r'\s+' is the documented replacement for the deprecated delim_whitespace=True
        return pd.read_table(log_file, sep=r'\s+', header=0)
    except Exception as e:
        # Best effort: an unparseable log must not abort the caller's loop over files.
        warnings.warn(
            'Problem reading xrcmp output to pandas dataframe, '
            f'returning error code: {e}'
        )
        return ret


def _compare_nc_nccmp(
    candidate_nc: str,
    reference_nc: str,
    stats_only: bool = False,
    nccmp_options: list = None,
    exclude_vars: list = None,
    exclude_atts: list = None,
    n_cores: int = 0
):

    """Private method to compare two netcdf files using nccmp.
    This is wrapped by compare_ncfiles to apply it to a list of one or more files
    Args:
        candidate_nc: The path for the candidate netcdf file
        reference_nc: The path for the reference netcdf file
        stats_only: Only return statistics on differences in data values
        nccmp_options: List of long-form command line options passed to nccmp,
        see http://nccmp.sourceforge.net/ for options. Defaults are '--metadata', '--force'
        exclude_vars: A list of strings containing variables names to
        exclude from the comparison.
        exclude_atts: A list of strings containing attribute names to exclude from the
        comparison.
        n_cores: Unused here; accepted so the signature matches _compare_nc_xrcmp
    Returns:
        None if nccmp exits with code 0. Otherwise the combined stderr and stdout text,
        or, when stats_only is True, stdout parsed into a pandas dataframe (falling back
        to the subprocess result object if parsing fails).
    """

    # Set default arguments
    if nccmp_options is None:
        nccmp_options = ['--metadata', '--force']

    # Try and set files to strings
    candidate_nc = str(candidate_nc)
    reference_nc = str(reference_nc)

    # Build the argument vector directly instead of formatting one string and splitting
    # it again: file paths or names containing spaces or quotes then stay intact.
    command = ['nccmp']
    for option in nccmp_options:
        # An option entry may itself carry a value ("--flag value"); keep splitting those.
        command.extend(shlex.split(option))

    command.append('-S')

    if exclude_vars is not None:
        # nccmp expects a comma separated list of variable names
        command.append('--exclude=' + ','.join(exclude_vars))

    if exclude_atts is not None:
        # nccmp expects a comma separated list of attribute names
        command.append('--Attribute=' + ','.join(exclude_atts))

    command.append(candidate_nc)
    command.append(reference_nc)

    # Run the subprocess to call nccmp
    proc = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE
    )

    # A zero return code is reported as "no result"
    if proc.returncode == 0:
        return None

    if not stats_only:
        return proc.stderr.decode('utf-8') + proc.stdout.decode('utf-8')

    try:
        # Statistics are written to stdout; parse them as a whitespace separated table.
        # sep=r'\s+' is the documented replacement for the deprecated delim_whitespace=True
        output = io.StringIO(proc.stdout.decode('utf-8'))
        return pd.read_table(output, sep=r'\s+', header=0)
    except Exception as e:
        # Best effort: hand back the raw subprocess result when stdout is not a table.
        warnings.warn(
            'Problem reading nccmp output to pandas dataframe, '
            f'returning as subprocess object: {e}'
        )
        return proc


def _check_file_lists(candidate_files: list, reference_files: list) -> tuple:
    """Function to check two lists of pathlib.Paths for commonly occuring files between the two
    Args:
        candidate_files: The candidate file list
        reference_files: The reference file list
    Returns:
        A tuple of lists sorted by file name of common files
    """
    candidate_names = [file.name for file in candidate_files]
    reference_names = [file.name for file in reference_files]

    # Get only files occurring in both lists
    matching_files = list(set(candidate_names).intersection(reference_names))

    # Print warning about missing files
    missing_ref_files = [file.name for file in candidate_files if file.name not in matching_files]
    missing_can_files = [file.name for file in reference_files if file.name not in matching_files]
    if len(missing_ref_files) > 0:
        if len(missing_ref_files) == 1:
            miss_file_str = str(missing_ref_files[0])
        else:
            miss_file_str = ', '.join(missing_ref_files)
        warnings.warn(
            'The following reference files were not found in the candidate: ' + miss_file_str)

    if len(missing_can_files) > 0:
        if len(missing_can_files) == 1:
            miss_file_str = str(missing_can_files[0])
        else:
            miss_file_str = ', '.join(missing_can_files)
        warnings.warn(
            'The following candidate files were not found in the reference: ' + miss_file_str)

    # Subset lists to only those files that occur in both
    valid_ref_files = [file for file in candidate_files if file.name in matching_files]
    valid_can_files = [file for file in reference_files if file.name in matching_files]

    # Sort files by name
    valid_ref_files.sort(key=lambda x: x.name)
    valid_can_files.sort(key=lambda x: x.name)

    return valid_can_files, valid_ref_files



[docs]
def check_unprocessed_diffs(unexpected_diffs: list):
    """Function to check the unexpected_diffs of DeepDiff
    Each entry is expected to look like ``<key>: <left> and <right>``. Entries whose two
    sides are equal are ignored. A difference whose key contains '_compose_dir' only
    produces a warning; any other difference raises.
    Args:
        unexpected_diffs: The unexpected diffs list
    Returns:
        None
    Raises:
        ValueError: If an entry outside of '_compose_dir' has differing sides.
        IndexError: If an entry has no ':' separator or no ' and ' separator.
    """
    for diff_str in unexpected_diffs:
        key_and_diff = diff_str.split(':', 1)
        diff_key = key_and_diff[0].strip()
        diff_parts = key_and_diff[1].strip().split(' and ')

        if diff_parts[0] == diff_parts[1]:
            continue

        if '_compose_dir' in diff_key:
            warnings.warn(
                'deepdiff _compose_dirs were different: '
                f'{diff_key}: {diff_parts[0]} != {diff_parts[1]}',
                UserWarning
            )
            # Keep checking the remaining entries: returning here would let a later,
            # genuinely unexpected difference slip through depending on list order.
            continue

        # if difference isn't in _compose_dir raise error
        raise ValueError(
            'Unexpected attribute differences in unexpected members from DeepDiff output: '
            f'{diff_key}: {diff_parts[0]} != {diff_parts[1]}'
        )