# Source code for upxo.repqual.mcgs2d_representativeness_assesser

from .._sup import dataTypeHandlers as dth
import pandas as pd
import numpy as np
from scipy.stats import gaussian_kde
from scipy.stats import entropy
from scipy.stats import ks_2samp
from scipy.stats import mannwhitneyu
from scipy.stats import kruskal
from scipy.stats import shapiro
from scipy.stats import skew
from scipy.stats import kurtosis
from scipy.spatial.distance import jensenshannon

class mc2repr():
    """
    Representativeness qualification of grain structures.

    Compares the morphological-property distributions of a target grain
    structure against those of one or more sample grain structures.

    Parameters
    ----------
    target_type: str
        Source of target data. Options:
            1. ebsd0 - un-processed 2D EBSD map: DefDAP object.
            2. ebsd1 - processed DefDAP data. Remapped with avg. ori.
            3. umc2 - UPXO Monte-Carlo Grain structure 2D.
            4. umc3 - UPXO Monte-Carlo Grain structure 3D.
            5. uvt2 - UPXO Voronoi-Tessellation Grain Structure 2D.
            6. stats - Data samples across grain morphology parameters.
               Needs xori. Could be in the form of a dictionary or a pandas
               dataframe. If dict or pandas dataframe, the key or column
               name respectively must be the name of the parameter.
               Examples of parameter names include:
                   1. area, perimeter
                   2. aspect ratio, morphological orientation

    target: object
        Target grain structure data. Details:
            1. `MCGS.gs[tslice]` for umc2 and umc3
            2. `VTGS` for uvt2
            3. ddap_ebsd - for un-processed or processed DefDAP data
        The methods of this class read the property table from
        ``target.prop`` and index it by parameter name.

    samples: dict
        Samples to match against the target. Keys should be sample names.
        Values should contain either grain structure objects, or the
        flag-string 'make'.

        If a value is a grain structure object, then it will be used as a
        sample. It can be of types (a) umc2, (b) umc3 and (c) uvt2.

        If a value is 'make', then the following is intended to be
        performed:
            1. read the excel file for grain structure generation parameters
            2. simulate the grain structure evolution
            3. pull out specified slices at specified temporal slice
               intervals
            4. characterize the temporal slices
        NOTE: none of the methods defined in this class act on the 'make'
        flag; they all expect every sample to expose a ``prop`` table.

    par_bounds: dict
        For each parameter in the key, the value must be a list of:
            [match bounds for peak locations in percentage,
             match bounds for peak location density in percentage,
             J-S test bounds]
        KEYS: area, perimeter, aspect ratio
        VALUES: bounds: [[5, 5], [5, 5], [0.1, 0.1]]

    metrics: list
        List of metrics to use to enable representativeness qualification.
        Examples include:
            1. modes_n
            2. modes_loc
            3. modes_width
            4. distr_type
            5. skewness
            6. kurtosis

    kde_options: dict
        key: bw_method
        value: choose from 'scott', 'silverman' or a scalar value
    """

    __slots__ = ('target_type', 'target', 'samples', 'par_bounds', 'metrics',
                 'kde_options', 'stat_tests', 'test_threshold', 'stest',
                 'test_metrics', 'parameters', 'distr_type', 'performance'
                 )

    def __init__(self, target_type=None, target=None, samples=None,
                 par_bounds=None, metrics=None, kde_options=None,
                 stest=None, test_metrics=None, parameters=None,
                 ):
        """
        This is a core UPXO class and has the following functions:
            * Calculate type of statistical distribution of the specified
              morphological properties of the target grain structure and
              sample grain structures.
            * Estimate statistical similarity between the target grain
              structure and each of the "samples" grain structures
            * Provide an acceptance flag for each samples grain structures

        ``stest``, ``test_metrics`` and ``parameters`` default to None, in
        which case a fresh copy of the standard configuration is created for
        this instance. (Using the dict/list literals directly as default
        arguments would share one mutable object between all instances.)
        """
        if stest is None:
            stest = {'tests': ['correlation',
                               'kldiv',
                               'ks',
                               'jsdiv',
                               'mannwhitneyu',
                               'kruskalwallis',
                               ],
                     'mw_p_threshold': 0.90,
                     'kw_p_threshold': 0.90,
                     'ks_p_threshold': 0.90,
                     }
        if test_metrics is None:
            test_metrics = ['mode0_location',
                            'mode0_count',
                            'mode1_location',
                            'mode1_count',
                            'mean',
                            ]
        if parameters is None:
            parameters = ['area', ]

        self.target_type = target_type
        self.target = target
        self.samples = samples
        self.par_bounds = par_bounds
        self.metrics = metrics
        self.kde_options = kde_options
        self.stest = stest
        self.test_metrics = test_metrics
        self.parameters = parameters
        self.performance = {}
        # Populated by build_distribution_dataset / determine_distr_type.
        self.distr_type = {}

    def load_target(self, target=None, target_type=None):
        """Store the target grain structure and its type identifier."""
        self.target = target
        self.target_type = target_type

    def load_samples(self, samples=None):
        """
        Store the samples to be compared against the target.

        A dict (sample name -> sample object) is what every analysis method
        of this class iterates over. Other container types listed in
        ``dth.dt.ITERABLES`` are still accepted for backward compatibility.
        Anything else is rejected with a printed message and the currently
        stored samples are left untouched.
        """
        if isinstance(samples, dict) or type(samples) in dth.dt.ITERABLES:
            self.samples = samples
        else:
            print('samples must be a dict (or another supported iterable).')

    def add_sample(self, sample=None, name=None):
        """
        Add a single sample to the stored samples.

        Parameters
        ----------
        sample: object
            Sample to add. Nothing happens when it is None.
        name: str, optional
            Key to store the sample under when the samples are held in a
            dict. When omitted, the first unused name of the form
            ``sample<N>`` is generated. Ignored for list-like samples.
        """
        if sample is None:
            return
        if self.samples is None:
            # The analysis methods iterate samples as a mapping.
            self.samples = {}
        if isinstance(self.samples, dict):
            if name is None:
                index = len(self.samples) + 1
                while f"sample{index}" in self.samples:
                    index += 1
                name = f"sample{index}"
            self.samples[name] = sample
        else:
            self.samples.append(sample)

    def set_stests(self, tests):
        """Replace the list of statistical test names run by `test`."""
        self.stest['tests'] = tests

    @staticmethod
    def _read_unit_interval(label, value):
        """Return value once it lies in [0, 1], prompting on stdin until so."""
        while not 0 <= value <= 1:
            value = float(input(f"{label} [0, 1]: "))
        return value

    def set_cor_thresh(self, cor_threshold):
        """Store the correlation threshold; prompts while outside [0, 1]."""
        self.stest['cor_threshold'] = self._read_unit_interval(
            'cor_threshold', cor_threshold)

    def set_kldiv_thresh(self, kldiv_thresh):
        """Store the KL-divergence threshold; prompts while outside [0, 1]."""
        self.stest['kldiv_thresh'] = self._read_unit_interval(
            'kldiv_thresh', kldiv_thresh)

    def set_ks_thresh(self, ks_thresh_D, ks_thresh_P):
        """Store the K-S D and P thresholds; prompts while outside [0, 1]."""
        self.stest['ks_thresh_D'] = self._read_unit_interval(
            'ks_thresh_D', ks_thresh_D)
        self.stest['ks_thresh_P'] = self._read_unit_interval(
            'ks_thresh_P', ks_thresh_P)

    def set_jsdiv_thresh(self, jsdiv_thresh):
        """Store the J-S divergence threshold; prompts while outside [0, 1]."""
        self.stest['jsdiv_thresh'] = self._read_unit_interval(
            'jsdiv_thresh', jsdiv_thresh)

    def prop_to_excel(self, filename="pxtal_properties", ):
        """
        Write the property tables to ``<filename>.xlsx``.

        The target table goes to the sheet 'target' and each sample table to
        a sheet 'sample<i>', numbered from 1 in the iteration order of
        ``self.samples``.
        """
        with pd.ExcelWriter(f"{filename}.xlsx") as writer:
            self.target.prop.to_excel(writer, sheet_name='target',
                                      index=False)
            for i, sample in enumerate(self.samples.values(), start=1):
                sample.prop.to_excel(writer, sheet_name=f"sample{i}",
                                     index=False)

    def build_distribution_dataset(self):
        """
        Reset ``self.distr_type`` to an empty result skeleton.

        The skeleton has one entry for 'target' and one per sample name;
        each maps every parameter name to a dict of descriptors, all
        initialised to None. Nothing is returned.
        """
        descriptors = ('right_skewed', 'left_skewed', 'leptokurtic',
                       'platykurtic', 'normal', 'kurtosis', 'skewness')
        self.distr_type = {'target': {}}
        for sample_name in self.samples:
            self.distr_type[sample_name] = {}
        for per_dataset in self.distr_type.values():
            for parameter in self.parameters:
                per_dataset[parameter] = dict.fromkeys(descriptors)

    @staticmethod
    def _describe_distribution(values):
        """Return the shape descriptors (only those that apply) of values."""
        data = np.asarray(values, dtype=float)
        # NaNs would propagate into every statistic below.
        data = data[~np.isnan(data)]
        skewness = skew(data)
        kurt = kurtosis(data)
        # shapiro needs at least 3 observations; below that, normality
        # cannot be assessed and the data is reported as not normal.
        shapiro_p = shapiro(data)[1] if data.size >= 3 else float('nan')

        described = {'skewness': skewness, 'kurtosis': kurt}
        if skewness > 0:
            described['right_skewed'] = True
        elif skewness < 0:
            described['left_skewed'] = True
        if kurt > 0:
            described['leptokurtic'] = True
        elif kurt < 0:
            described['platykurtic'] = True
        described['normal'] = bool(abs(skewness) < 0.5
                                   and abs(kurt) < 1
                                   and shapiro_p > 0.05)
        return described

    def determine_distr_type(self):
        """
        Characterise the distribution of every parameter of every dataset.

        Rebuilds ``self.distr_type`` and then fills, for the target and for
        each sample, per parameter:
            * 'skewness' and 'kurtosis' (scipy.stats defaults, i.e. excess
              kurtosis), computed on the non-NaN values,
            * 'right_skewed' / 'left_skewed' set to True for positive /
              negative skewness,
            * 'leptokurtic' / 'platykurtic' set to True for positive /
              negative excess kurtosis,
            * 'normal' set to True when |skewness| < 0.5, |kurtosis| < 1 and
              the Shapiro-Wilk p-value exceeds 0.05, otherwise False.
        Descriptors that do not apply keep their initial value of None.
        """
        self.build_distribution_dataset()
        for parameter_name in self.parameters:
            self.distr_type['target'][parameter_name].update(
                self._describe_distribution(self.target.prop[parameter_name]))
        for sample_name, sample in self.samples.items():
            for parameter_name in self.parameters:
                # Each sample is described by its own statistics.
                self.distr_type[sample_name][parameter_name].update(
                    self._describe_distribution(sample.prop[parameter_name]))

    @staticmethod
    def _run_stest(test, target_values, sample_values):
        """Run one named test on two pandas Series; None if unavailable."""
        if test == 'correlation':
            # pandas aligns the two series on their index before correlating.
            return target_values.corr(sample_values)
        if test == 'kldiv':
            print('kldiv test not available')
            return None
        if test == 'ks':
            ks_D, ks_P = ks_2samp(target_values.dropna(),
                                  sample_values.dropna())
            return (ks_D, ks_P)
        if test == 'jsdiv':
            # TODO: DEBUG the length mismatch
            # SOLn: Make KDE and resample data iteratively based on user
            # satisfaction of number of bins in histogram and bandwidth in
            # KDE calculation
            return None
        if test == 'mannwhitneyu':
            mwu_D, mwu_P = mannwhitneyu(target_values.dropna(),
                                        sample_values.dropna())
            return (mwu_D, mwu_P)
        if test == 'kruskalwallis':
            kw_D, kw_P = kruskal(target_values.dropna(),
                                 sample_values.dropna())
            return (kw_D, kw_P)
        return None

    def test(self):
        """
        Run the configured statistical tests of every sample against target.

        Results are stored in
        ``self.performance[sample_name][parameter][test_name]``. Tests that
        are not available ('kldiv', 'jsdiv') or not recognised store None.

        TEST 1: correlation: For two datasets, it is a measure of the linear
        relationship between them. If correlation is close to 1 then, the
        distributions are very similar.

        TEST 2: kldiv: not available.

        TEST 3: ks: Kolmogorov-Smirnov test: Determines if the two
        distribution samples differ significantly. It uses cumulative
        distributions of the two datasets. Stores (D-statistic, P-value).
            * D-statistic: maximum absolute difference of the cumulative
              distributions (absolute max distance (supremum) b/w the CDFs
              of the two samples). A smaller D-statistic value is indicative
              of similar distributions.
            * P-value: if the p-value is low (<= 0.05), the distributions
              are different. If the p-value is high (> 0.05), we cannot
              reject the null-hypothesis that the two distributions are the
              same.
            * Note: if P <= 0.05: the null hypothesis that the two samples
              are drawn from the same distribution can be rejected,
              indicating that the samples are not representative of the
              target.

        TEST 4: jsdiv: not yet implemented. The J-S distance will always be
        between 0 and 1.
            @ 0: Distributions are identical.
            @ 1: Distributions are completely different.

        TEST 5: mannwhitneyu: Mann-Whitney test: Used to determine if two
        distribution samples are drawn from the same population. If the
        P-value is less than or equal to 0.05, then the distributions are
        different. If the P-value is > 0.05, then the two distributions are
        similar. Stores (statistic, P-value).

        TEST 6: kruskalwallis: Kruskal-Wallis test. Used to determine if
        there are statistically significant differences between two
        distributions. Stores (statistic, P-value).
        """
        tests = self.stest['tests']
        if not tests:
            return
        # Iterate through each of the sample objects
        for sample_name, sample in self.samples.items():
            print('-----------sample-----------')
            per_sample = {}
            self.performance[sample_name] = per_sample
            for par in self.parameters:
                target_values = self.target.prop[par]
                sample_values = sample.prop[par]
                per_sample[par] = {
                    test: self._run_stest(test, target_values, sample_values)
                    for test in tests
                }