Source code for upxo.repqual.mcgs2d_representativeness_assesser

from .._sup import dataTypeHandlers as dth
import pandas as pd
import numpy as np
from scipy.stats import gaussian_kde
from scipy.stats import entropy
from scipy.stats import ks_2samp
from scipy.stats import mannwhitneyu
from scipy.stats import kruskal
from scipy.stats import shapiro
from scipy.stats import skew
from scipy.stats import kurtosis
from scipy.spatial.distance import jensenshannon


[docs]
class mc2repr():
    """
    Representativeness qualification of sample grain structures against a
    target grain structure.

    Representativeness qualification
    ----------------------------------
    target_type: str
        Source of target data. Options:
            1. ebsd0 - un-processed 2D EBSD map: DefDAP object.
            2. ebsd1 - processed DefDAP data. Remapped with avg. ori.
            3. umc2 - UPXO Monte-Carlo Grain structure 2D.
            4. umc3 - UPXO Monte-Carlo Grain structure 3D.
            5. uvt2 - UPXO Voronoi-Tessellation Grain Structure 2D.
            6. stats - Data samples across grain morphology par. Needs xori.
                       Could be in the form of a dictionary or pandas
                       dataframe. If dict or pandas dataframe, the key or
                       column name respectively, must be the name of the
                       parameter.
                       Examples of parameter names include:
                           1. area, perimeter
                           2. aspect ratio, morphological orientation
    ----------------------------------
    target: object
        Target grain structure data. Details:
            1. `MCGS.gs[tslice]` for umc2 and umc3
            2. `VTGS` for uvt2
            3. ddap_ebsd - for un-processed or processed DefDAP data
    ----------------------------------
    samples: dict
        Samples to match against the target.
        Keys should be sample_names
        Values should contain either:
            grain structure objects, or
            flag-string, 'make'
        If a value is a grain structure object, then it will be used as
        samples. It can be of types (a) umc2, (b) umc3 and (c) uvt2
        If a value is 'make', then the following will be performed:
            1. read the excel file for grain structure generation parameters
            2. simulate the grain structure evolution
            3. Pull out specified slices at specified temporal slice intervals
            4. Characterize the temporal slices
    ----------------------------------
    par_bounds: dict
        DESCRIPTION:
            For each parameter in the key, value must be a list of:
                [match bounds for peak locations in percentage,
                 match bounds for peak location density in percentage,
                 J-S test bounds
                 ]
        KEYS:
            area, perimeter, aspect ratio
        VALUES:
            bounds: [ [5, 5], [5, 5], [0.1, 0.1]]
    ----------------------------------
    metrics: list
        DESCRIPTION:
            List of metrics to use to enable representativeness qualification
            Examples include:
                1. modes_n
                2. modes_loc
                3. modes_width
                4. distr_type
                5. skewness
                6. kurtosis
    ----------------------------------
    kde_options: dict
        DESCRIPTION:
            key: bw_method
            value: choose from 'scott', 'silverman' or a scalar value
    ----------------------------------
    """
    # Instances carry only the attributes named here (no per-instance
    # __dict__); assigning any other attribute name raises AttributeError.
    __slots__ = ('target_type',
                 'target',
                 'samples',
                 'par_bounds',
                 'metrics',
                 'kde_options',
                 'stat_tests',
                 'test_threshold',
                 'stest',
                 'test_metrics',
                 'parameters',
                 'distr_type',
                 'performance'
                 )

    def __init__(self,
                 target_type=None,
                 target=None,
                 samples=None,
                 par_bounds=None,
                 metrics=None,
                 kde_options=None,
                 stest={'tests': ['correlation',
                                  'kldiv',
                                  'ks',
                                  'jsdiv',
                                  'mannwhitneyu',
                                  'kruskalwallis',
                                  ],
                        'mw_p_threshold': 0.90,
                        'kw_p_threshold': 0.90,
                        'ks_p_threshold': 0.90,
                        },
                 test_metrics=['mode0_location',
                               'mode0_count',
                               'mode1_location',
                               'mode1_count',
                               'mean',
                               ],
                 parameters=['area',
                             ],
                 ):
        """
        This is a core UPXO class and has the following functions:

            * Caclulate type of statistical distribution of the specified
              morphological properties of the target grain structure
              and sample grain structures.

            * Estimate statistical similarity between the target grain
              structure and each of the "samples" grain structures

            * Provide an acceptance flag for each samples grain structures
        """
        self.target_type = target_type
        self.target = target
        self.samples = samples
        self.par_bounds = par_bounds
        self.metrics = metrics
        self.kde_options = kde_options
        self.stest = stest
        self.test_metrics = test_metrics
        self.parameters = parameters
        self.performance = {}
        # from scipy.stats import gaussian_kde


[docs]
    def load_target(self,
                    target=None,
                    target_type=None):
        """Load or import target."""
        self.target = target
        self.target_type = target_type



[docs]
    def load_samples(self,
                     samples=None):
        """
        Store ``samples`` on the instance when its type is accepted.

        The exact type of ``samples`` must be a member of
        ``dth.dt.ITERABLES``; otherwise a message is printed and the
        current ``self.samples`` is left untouched.
        """
        # Guard clause: reject anything whose exact type is not listed.
        if type(samples) not in dth.dt.ITERABLES:
            print('samples must be of the type list.')
            return
        self.samples = samples



[docs]
    def add_sample(self,
                   sample=None):
        """Add or insert sample."""
        if sample:
            self.samples.append(sample)



[docs]
    def set_stests(self,
                   tests):
        """Set or update stests."""
        self.stest['tests'] = tests



[docs]
    def set_cor_thresh(self,
                       cor_threshold):
        """Set or update cor thresh."""
        while cor_threshold < 0 or cor_threshold > 1:
            self.stest['cor_threshold'] = float(input("cor_threshold [0, 1]: "))



[docs]
    def set_kldiv_thresh(self,
                         kldiv_thresh):
        """Set or update kldiv thresh."""
        while kldiv_thresh < 0 or kldiv_thresh > 1:
            self.stest['kldiv_thresh'] = float(input("kldiv_thresh [0, 1]: "))



[docs]
    def set_ks_thresh(self,
                      ks_thresh_D,
                      ks_thresh_P):
        """Set or update ks thresh."""
        while ks_thresh_D < 0 or ks_thresh_D > 1:
            self.stest['ks_thresh_D'] = float(input("ks_thresh_D [0, 1]: "))
        while ks_thresh_P < 0 or ks_thresh_P > 1:
            self.stest['ks_thresh_P'] = float(input("ks_thresh_P [0, 1]: "))



[docs]
    def set_jsdiv_thresh(self,
                         jsdiv_thresh):
        """Set or update jsdiv thresh."""
        while jsdiv_thresh < 0 or jsdiv_thresh > 1:
            self.stest['jsdiv_thresh'] = float(input("jsdiv_thresh [0, 1]: "))



[docs]
    def prop_to_excel(self,
                      filename="pxtal_properties",
                      ):
        """
        Write the ``prop`` tables of the target and every sample to Excel.

        One workbook named ``<filename>.xlsx`` is produced. The target goes
        to the sheet ``target``; the samples follow in iteration order of
        ``self.samples.values()`` on sheets ``sample1``, ``sample2``, ...
        (numbered by position, not by sample name). Index columns are not
        written.

        Parameters
        ----------
        filename : str, optional
            Workbook path without the ``.xlsx`` extension.
        """
        def write_sheet(owner, sheet, book):
            # Every sheet is exported the same way: no index column.
            owner.prop.to_excel(book, sheet_name=sheet, index=False)

        with pd.ExcelWriter(f"{filename}.xlsx") as workbook:
            write_sheet(self.target, 'target', workbook)
            sheet_number = 0
            for member in self.samples.values():
                sheet_number += 1
                write_sheet(member, f"sample{sheet_number}", workbook)



[docs]
    def build_distribution_dataset(self):
        """Build and return  distribution dataset."""
        self.distr_type = {'target': {}}
        for sample_name in self.samples.keys():
            self.distr_type[sample_name] = {}
        for key in self.distr_type.keys():
            for parameter in self.parameters:
                self.distr_type[key][parameter] = {'right_skewed': None,
                                                   'left_skewed': None,
                                                   'leptokurtic': None,
                                                   'platykurtic': None,
                                                   'normal': None,
                                                   'kurtosis': None,
                                                   'skewness': None
                                                   }



[docs]
    def determine_distr_type(self):
        """Determine distr type."""
        self.build_distribution_dataset()
        for parameter_name in self.parameters:
            target_skewness = skew(self.target.prop[parameter_name])
            target_kurt = kurtosis(self.target.prop[parameter_name])
            shapiro_stat, shapiro_p = shapiro(self.target.prop[parameter_name])
            self.distr_type['target'][parameter_name]['skewness'] = target_skewness
            self.distr_type['target'][parameter_name]['kurtosis'] = target_kurt
            if target_skewness > 0:
                self.distr_type['target'][parameter_name]['right_skewed'] = True
                if target_kurt > 0:
                    self.distr_type['target'][parameter_name]['leptokurtic'] = True
                else:
                    self.distr_type['target'][parameter_name]['platykurtic'] = True
            else:
                self.distr_type['target'][parameter_name]['left_skewed'] = True
                if target_kurt > 0:
                    self.distr_type['target'][parameter_name]['leptokurtic'] = True
                else:
                    self.distr_type['target'][parameter_name]['platykurtic'] = True
            if abs(target_skewness) < 0.5 and abs(target_kurt) < 1 and shapiro_p > 0.05:
                self.distr_type['target'][parameter_name]['normal'] = True
            else:
                self.distr_type['target'][parameter_name]['normal'] = False

        for sample_name, sample in self.samples.items():
            for parameter_name in self.parameters:
                sample_skewness = skew(sample.prop[parameter_name])
                sample_kurt = kurtosis(sample.prop[parameter_name])
                stat, p = shapiro(sample.prop[parameter_name])
                self.distr_type[sample_name][parameter_name]['skewness'] = target_skewness
                self.distr_type[sample_name][parameter_name]['kurtosis'] = target_kurt
                if sample_skewness > 0:
                    self.distr_type[sample_name][parameter_name]['right_skewed'] = True
                    if sample_kurt > 0:
                        self.distr_type[sample_name][parameter_name]['leptokurtic'] = True
                    else:
                        self.distr_type[sample_name][parameter_name]['platykurtic'] = True
                else:
                    self.distr_type[sample_name][parameter_name]['left_skewed'] = True
                    if sample_kurt > 0:
                        self.distr_type[sample_name][parameter_name]['leptokurtic'] = True
                    else:
                        self.distr_type[sample_name][parameter_name]['platykurtic'] = True
                if abs(sample_skewness) < 0.5 and abs(sample_kurt) < 1 and shapiro_p > 0.05:
                        self.distr_type[sample_name][parameter_name]['normal'] = True
                else:
                    self.distr_type[sample_name][parameter_name]['normal'] = False



[docs]
    def test(self):
        """
        Run the configured statistical tests of every sample against the
        target and record the outcomes.

        For each sample, each parameter in ``self.parameters`` and each test
        name in ``self.stest['tests']``, the result is written to
        ``self.performance[sample_name][parameter][test_name]``. Every entry
        starts as ``None`` and stays ``None`` for tests that are not
        implemented here ('kldiv', 'jsdiv') or whose name is not recognised.
        Data are read from ``self.target.prop[par]`` and
        ``sample.prop[par]``. The thresholds held in ``self.stest`` are not
        consulted in this portion of the method.

        TEST 1: correlation: For two datasets, it is a measure of the linear
        relationship between them. If correlation is close to 1 then, the
        distributions are very similar. Stored as the value returned by
        ``self.target.prop[par].corr(sample.prop[par])``.

        TEST 2: kldiv: Not available; a message is printed and the stored
        result remains ``None``.

        TEST 3: ks: Kolmogorov-Smirnov test: Determines if the two
        distribution samples differ significantly. It uses cumulative
        distributions of the two datasets. Returns D-statistic and P-value,
        stored as the tuple ``(D, P)``.
            * D-statistic: maximum absolute difference of the cumulative
            distributions (absolute max distance (supremum) b/w the CDFs
            of the two samples). A smaller D-statistic value is indicative
            of similar distributions.
            * P-value: If p-value is low (<= 0.05), distributions are
            different. If p-value is high (> 0.05), we cannot reject the
            null-hypothesis that the two distributions are the same.
            * Note: if P <= 0.05: the null hypothesis that the two samples
            are drawn from the same distribution can be rejected, indicating
            that the samples are not representative of the target

        TEST 4: jsdiv: Value will always be between 0 and 1.
        @ 0: Distributions are identical. @ 1: Distributions are completely
        different. Not computed yet (see the TODO in the body); the stored
        result remains ``None``.

        TEST 5: mannwhitneyu: Mann-Whitney test: Used to determine if two
        distribution samples are drawn from populations having the same
        distribution. If P-value is less than or equal to 0.05, then
        different distributions. If P-value is > 0.05, then the two
        distributions are similar. Missing values are dropped first; stored
        as the tuple ``(statistic, P)``.

        TEST 6: kruskalwallis: Kruskal-Wallis test. Used to determine if there
        are statistically significant differences between two distributions.
        Missing values are dropped first; stored as the tuple
        ``(statistic, P)``.
        """
        # Function-local imports, performed only for the requested tests.
        if 'kldiv' in self.stest['tests']:
            from scipy.stats import entropy
        if 'jsdiv' in self.stest['tests']:
            from scipy.spatial.distance import jensenshannon
        if 'ks' in self.stest['tests']:
            from scipy.stats import ks_2samp
        if 'mannwhitneyu' in self.stest['tests']:
            from scipy.stats import mannwhitneyu
        if 'kruskalwallis' in self.stest['tests']:
            from scipy.stats import kruskal
        if self.stest['tests']:
            # Iterate through each of the sample object
            for sample_name, sample in self.samples.items():
                print('-----------sample-----------')
                # Any earlier results of this sample are discarded.
                self.performance[sample_name] = {}
                # NOTE(review): ipar is not used in the visible code.
                for ipar, par in enumerate(self.parameters, start=1):
                    self.performance[sample_name][par] = {}
                    for test in self.stest['tests']:
                        # Default outcome; kept when the test is skipped.
                        self.performance[sample_name][par][test] = None
                        if test == 'correlation':
                            correlation = self.target.prop[par].corr(sample.prop[par])
                            self.performance[sample_name][par][test] = correlation
                        # -------------------------------------
                        if test == 'kldiv':
                            print('kldiv test not available')
                        # -------------------------------------
                        if test == 'ks':
                            ks_D, ks_P = ks_2samp(self.target.prop[par],
                                                  sample.prop[par])
                            self.performance[sample_name][par][test] = (ks_D,
                                                                         ks_P)
                        # -------------------------------------
                        if test == 'jsdiv':
                            # TODO: DEBUG the length mismatch
                            # SOLn: Make KDE and resample data iteratively
                            # based on user satisfaction of number of bins in
                            # histogram and bandwidth in KDE calculation
                            pass
                            #js_P = jensenshannon(self.target.prop[par],
                            #                     sample.prop[par])
                            #self.performance[sample_name][par][test] = js_P
                        # -------------------------------------
                        if test == 'mannwhitneyu':
                            mwu_D, mwu_P = mannwhitneyu(self.target.prop[par].dropna(),
                                                        sample.prop[par].dropna())
                            self.performance[sample_name][par][test] = (mwu_D,
                                                                        mwu_P)
                        # -------------------------------------
                        if test == 'kruskalwallis':
                            kw_D, kw_P = kruskal(self.target.prop[par].dropna(),
                                                 sample.prop[par].dropna())
                            self.performance[sample_name][par][test] = (kw_D,
                                                                        kw_P)


                        # -------------------------------------