Commit 24d396b2 authored by Leonie Pick

Implementation of changes made during the revision process.

parent ee8fc60d
......@@ -3,6 +3,7 @@ Dump/
# Don't track the following files
Input.nc
Input_development.nc
.ipynb_checkpoints/
# Don't track all files with these endings:
......
......@@ -16,6 +16,7 @@ mpl.use('TkAgg')
import matplotlib.pyplot as plt
plt.style.use('bmh')
import matplotlib.gridspec as gridspec
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
# Scipy
from scipy import interp
from scipy.signal import butter, filtfilt
......@@ -30,4 +31,5 @@ from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.calibration import calibration_curve
......@@ -21,7 +21,7 @@ def LowpassFilter(data, cutoff, fs, order=1):
return y
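The body of LowpassFilter lies outside this hunk; a minimal sketch of a zero-phase Butterworth low-pass filter, assuming the imported butter/filtfilt pair is used in the usual way:

from scipy.signal import butter, filtfilt

def lowpass_sketch(data, cutoff, fs, order=1):
    nyq = 0.5 * fs                                   # Nyquist frequency
    b, a = butter(order, cutoff / nyq, btype='low')  # Butterworth filter coefficients
    return filtfilt(b, a, data)                      # forward-backward filtering, zero phase shift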
###
###
def Search_TargetEvents(HMC, HMC11y, HMC5d, dHMC, HTime, DTime, grid, Save):
def Search_TargetEvents(HMC, HMC11y, HMC5d, dHMC, HTime, DTime, grid):
###### STEP 1 ## Threshold scaled in dependence on solar cycle phase
Index = HMC
......@@ -36,8 +36,6 @@ def Search_TargetEvents(HMC, HMC11y, HMC5d, dHMC, HTime, DTime, grid, Save):
Index_thres1 = np.percentile(HMC,percentile)*scaling**(power)
StormIndices = np.where(np.less(Index, Index_thres1))[0]
#print(np.percentile(HMC,percentile))
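A toy sketch of this selection step: the threshold is a percentile of the full HMC series, rescaled by a solar-cycle dependent factor, and candidate storm hours are those where the index falls below it (all values hypothetical):

import numpy as np

rng = np.random.default_rng(0)
HMC_toy = -np.abs(rng.normal(20, 15, size=1000))          # toy, negative-valued activity index
scaling_toy, percentile_toy, power_toy = 0.8, 29, 1       # hypothetical solar-cycle scaling
thres_toy = np.percentile(HMC_toy, percentile_toy) * scaling_toy**power_toy
candidate_hours = np.where(HMC_toy < thres_toy)[0]        # hours below the scaled threshold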
########## STEP 2a ## Combine consecutive UT hours into single events
IndexMin = []
IndexDiff = np.diff(StormIndices)
......@@ -78,11 +76,7 @@ def Search_TargetEvents(HMC, HMC11y, HMC5d, dHMC, HTime, DTime, grid, Save):
else: # difference is 1, end of interval
IndexMin.append(StormIndices[i-j]+np.argmin(Index[StormIndices[i-j:i+2]]))
# Test here how long j is and do the same as above
#print(jmax)
#print('---')
########## STEP 2b ## Make sure that single events are at least mini_range apart
IndexMin1 = IndexMin[:]
IndexDiff = np.diff(IndexMin1)
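A compact, hypothetical rewrite of steps 2a and 2b (not the original loop): consecutive candidate hours are merged into single events at the hour of minimum index, and events closer than mini_range hours are discarded:

import numpy as np

def group_candidates(candidates, index, mini_range=26):
    # 2a: split the sorted candidate hours into runs of consecutive indices and keep,
    #     per run, the hour where the index reaches its minimum
    runs = np.split(candidates, np.where(np.diff(candidates) > 1)[0] + 1)
    events = [run[np.argmin(index[run])] for run in runs if run.size]
    # 2b: drop events that follow a kept event by less than mini_range hours
    kept = []
    for ev in events:
        if not kept or ev - kept[-1] >= mini_range:
            kept.append(ev)
    return np.asarray(kept)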
......@@ -127,12 +121,10 @@ def Search_TargetEvents(HMC, HMC11y, HMC5d, dHMC, HTime, DTime, grid, Save):
if HMC_diff <= -thresHMC:
IndexMin2.append(i)
pl.Selection(HTime,DTime,HMC,Index_thres1,StormIndices,IndexMin,IndexMin1,IndexMin2,Save)
return IndexMin2
return Index_thres1, StormIndices, IndexMin1, IndexMin2
###
###
def Get_TargetEvents(HMC, HMC11y, HMC5d, dHMC, Kp_all, KpHours_all, Reference, Time, Date, YearStart, Save):
def Get_TargetEvents(HMC, HMC11y, HMC5d, dHMC, Kp_all, KpHours_all, Dst, Reference, Time, Date, YearStart, Mode, Save):
YearsIndex = np.where(Time[:,0] >= YearStart)[0]
TrClass = Reference[:,0]
......@@ -147,7 +139,7 @@ def Get_TargetEvents(HMC, HMC11y, HMC5d, dHMC, Kp_all, KpHours_all, Reference, T
accepted_loss_CIRs = 40
#---#
## Parameters
## Parameters for the selection algorithm
percentile = np.arange(29,29.1,1) #percentile = np.arange(8,9,1)
power = np.arange(1,1.1,1) #power = np.arange(0.9,1.0,0.1)
mini_range = np.arange(26,26.1,1) #mini_range = np.arange(78,79,1)
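The grid itself is assembled outside this hunk; a plausible construction (the fourth parameter, thresHMC, and its value are assumptions here) stacks all parameter combinations so that grid[i,:] yields one tuple per loop iteration:

import itertools
import numpy as np

thresHMC = np.arange(10, 10.1, 1)   # hypothetical value for grid[:,3]
# percentile, power, mini_range as defined above
grid = np.array(list(itertools.product(percentile, power, mini_range, thresHMC)))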
......@@ -157,8 +149,10 @@ def Get_TargetEvents(HMC, HMC11y, HMC5d, dHMC, Kp_all, KpHours_all, Reference, T
threshold = 0
for i in range(len(grid)):
Storms = Search_TargetEvents(HMC[YearsIndex], HMC11y[YearsIndex], HMC5d[YearsIndex], dHMC[YearsIndex],
Time[YearsIndex,:], Date[YearsIndex], grid[i,:], Save)
Index_thres, Storms_a, Storms_b, Storms = Search_TargetEvents(HMC[YearsIndex], HMC11y[YearsIndex], HMC5d[YearsIndex], dHMC[YearsIndex],
Time[YearsIndex,:], Date[YearsIndex], grid[i,:])
pl.Selection(Time[YearsIndex,:],Date[YearsIndex],HMC[YearsIndex],Index_thres,Storms_a,Storms_b,Storms,Kp_all,KpHours_all,Dst[YearsIndex],Save)
Found_CIRs = np.where(np.in1d(TrTimeIndex[np.logical_or(TrClass==0,TrClass==2)],Storms+YearsIndex[0]))[0]
Found_CMEs = np.where(np.in1d(TrTimeIndex[TrClass==1],Storms+YearsIndex[0]))[0]
......@@ -188,13 +182,13 @@ def Get_TargetEvents(HMC, HMC11y, HMC5d, dHMC, Kp_all, KpHours_all, Reference, T
###
TargetResults = {'No. target events':len(StormsWin),'Fraction [%]':len(StormsWin)*100/len(HMC[YearsIndex])}
TrainingResults = [[len(CIRs), len(CMEs), len(CIRs)+len(CMEs)],[len(Found_CIRs),len(Found_CMEs),len(Found_CIRs)+len(Found_CMEs)]]
SelectionParams = {r'$P_n [nT]$':grid[:,0],r'$p_{sc}$':grid[:,1], r'$\Delta t$ [h]':grid[:,2], r' $Hs$ [nT]':-grid[:,3]}
SelectionParams = {r'$n$':int(grid[:,0]),r'$p_{sc}$':int(grid[:,1]), r'$\Delta t$ [h]':int(grid[:,2]+1), r' $Hs$ [nT]':-int(grid[:,3])}
display(pd.DataFrame(data=SelectionParams,index=['Selection parameters']))
display(pd.DataFrame(data=TargetResults,index=['Selection result']))
display(pd.DataFrame(data=TrainingResults,columns=['No. CIRs','No. CMEs', 'Total'], index=['Reference set', 'Training set']))
###
pl.IndexDist(Time,YearsIndex,StormsWin,Kp_all,KpHours_all,HMC,Save)
pl.IndexDist(Time,YearsIndex,StormsWin,TrTimeIndex,Kp_all,KpHours_all,HMC,Dst,Mode,Save)
return StormsWin
###
......@@ -342,8 +336,8 @@ def Get_Features(Time, Storms, Reference, HMC, HMC11y, HMC1y, dHMC, B_MLT, ASY,
dAsyDD = np.gradient(ASY[:,0],1,edge_order=1)
dHMC11y = np.gradient(HMC11y,1,edge_order=1)
TrTimeIndex = Reference[:,1]#Reference.sel(Properties='TimeIndex')
TrClass = Reference[:,0]#Reference.sel(Properties='Class')
TrTimeIndex = Reference[:,1]
TrClass = Reference[:,0]
excludeEvents = []
count1=0; count2=0
......@@ -429,7 +423,6 @@ def Get_Diagnostics(FeatureMatrix,Storms,Reference,Save):
SData = dict(zip(keys, np.zeros(n_classes)))
for i in range(n_classes):
TargetClasses[i]=np.where(TrClass[TargetKnown]==i)[0]
#FData[i] = Data[TargetClasses[i],:]
NData[i] = NData2[TargetClasses[i],:]
NData[i][:,11] = Data[TargetClasses[i],11]
MData[i] = np.nanmedian(NData[i],axis=0)
......@@ -439,7 +432,6 @@ def Get_Diagnostics(FeatureMatrix,Storms,Reference,Save):
Ranges = np.zeros((n_features,n_classes,2))
k = 0
for i in [0,3,5,9,10,6,7,8,1,4,2,11]:#range(n_features):
#vall = np.zeros(n_classes); valr = np.zeros(n_classes)
for j in range(n_classes):
vall=np.percentile(NData[j][:,i],25); valr = np.percentile(NData[j][:,i],75)#; val2l = np.percentile(NCMEs[:,i],25); val2r = np.percentile(NCMEs[:,i],75)
FeatureResults[k,j] = np.around(MData[j][i],3)
......@@ -449,7 +441,6 @@ def Get_Diagnostics(FeatureMatrix,Storms,Reference,Save):
Overlap = Ranges[k,0,1]-Ranges[k,1,0]
else:
Overlap = Ranges[k,1,1]-Ranges[k,0,0]
#print(i,np.around(valr-vall,3))
FeatureResults[k,2] = np.around(abs(MData[0][i]-MData[1][i]),3)
FeatureResults[k,3] = np.around(Overlap,3)
FeatureResults[k,4] = np.around(abs(MData[0][i]-MData[1][i])-Overlap,3)
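A worked toy example of the separability measure computed above: per feature, the overlap of the two classes' interquartile ranges is subtracted from the distance between their medians (numbers hypothetical):

import numpy as np

cir_toy = np.array([1.0, 1.2, 1.4, 1.6, 1.8])             # toy feature values, class 0 (CIRs)
cme_toy = np.array([1.5, 1.7, 1.9, 2.1, 2.3])             # toy feature values, class 1 (CMEs)
q1_cir, q3_cir = np.percentile(cir_toy, [25, 75])          # 1.2, 1.6
q1_cme, q3_cme = np.percentile(cme_toy, [25, 75])          # 1.7, 2.1
overlap_toy = q3_cir - q1_cme                              # -0.1, i.e. no overlap
separation_toy = abs(np.median(cir_toy) - np.median(cme_toy)) - overlap_toy   # 0.5 + 0.1 = 0.6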
......@@ -546,20 +537,18 @@ def Select_Features(Data,Target,N2,K2,Pipe,SelectParams,EstimateParams,scoring,s
Scores[i,:,1] = GS.cv_results_[name]
Indices[i] = GS.best_index_
#print(GS.best_score_)
#print(GS.cv_results_['params'])
# Fix number of features in estimation parameters
Scores_mean = np.mean(Scores,axis=0)
Scores_std = np.std(Scores,axis=0)
FeaturesSelect = np.argmax(Scores_mean[:,0])+1
EstimateParams[list(SelectParams.keys())[0]] = [FeaturesSelect]
## Get feature ranking
# Get feature ranking
features = np.arange(0,nFeatures,1)
X_index = np.arange(len(features)).reshape(1,-1)
#for ii in np.arange(1,nFeatures+1,1):
# for ii in np.arange(1,nFeatures+1,1):
params = GS.cv_results_['params'][0]
Pipe.set_params(**params)
Pipe.fit(Data,Target)
......@@ -572,7 +561,7 @@ def Select_Features(Data,Target,N2,K2,Pipe,SelectParams,EstimateParams,scoring,s
else:
ranking = np.argsort(transformer.ranking_)
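The transformer is taken from the pipeline defined elsewhere; a small sketch assuming a recursive-feature-elimination step, whose ranking_ attribute is 1 for the kept features and grows for features eliminated earlier, so argsort orders the columns from most to least important:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

X_toy, y_toy = make_classification(n_samples=200, n_features=12, random_state=0)
rfe = RFE(LogisticRegression(max_iter=1000), n_features_to_select=5).fit(X_toy, y_toy)
ranking_toy = np.argsort(rfe.ranking_)   # column indices, most important first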
## Plot test results
# Plot test results
ax = pl.Ranking(Scores_mean, Scores_std, Data, ranking, FeaturesSelect, estimators, scoring_name, est, ax, Save)
return EstimateParams, ax
......@@ -589,8 +578,6 @@ def Optimize_Model(Data,Target,N1,K1,model,params,nModels,scoring,scoring_name):
GS = GridSearchCV(model, params, n_jobs=1, scoring=scoring, iid='False', cv=CV1, refit=scoring_name, return_train_score=True)
GS.fit(Data,Target)
#print(pd.DataFrame(GS.cv_results_))
for name in GS.cv_results_.keys():
if name.startswith('split'):
if scoring_name in name and 'test' in name:
......@@ -600,10 +587,6 @@ def Optimize_Model(Data,Target,N1,K1,model,params,nModels,scoring,scoring_name):
jj = int(name[5])
CV1_scores[:,j,jj,1] = GS.cv_results_[name]
#if name.startswith('rank_test_'+scoring_name):
#print(GS.cv_results_[name])
#print(np.argmin(GS.cv_results_[name]))
Param_Comb = GS.cv_results_['params']
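A toy sketch of the cv_results_ bookkeeping above: per-split test scores are stored under keys of the form 'split<k>_test_<scorer>', so the fold index can be read from the key name (toy data and parameter grid):

from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.svm import SVC

X_toy, y_toy = make_classification(n_samples=120, random_state=0)
gs = GridSearchCV(SVC(), {'C': [0.1, 1.0]}, scoring='f1',
                  cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=0))
gs.fit(X_toy, y_toy)
for name in gs.cv_results_:
    if name.startswith('split') and 'test' in name:
        print(name, int(name[5]), gs.cv_results_[name])   # fold index parsed from the key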
# Most frequent model (single fold count)
......@@ -646,8 +629,6 @@ def Find_Model(Data,Target,N1,K1,N2,K2,model,params,scoring,scoring_name,n_class
SumCounts = np.sum(CV2_counts,axis=1)
SumCounts = np.sum(SumCounts,axis=0)
#BestOnAll_count = np.histogram(BestOnSplit_count[:,:,0],np.arange(0,nModels+1,1)))
CV2_means = np.mean(CV2_scores,axis=1)
CV2_means = np.mean(CV2_means,axis=0)
......@@ -658,7 +639,6 @@ def Find_Model(Data,Target,N1,K1,N2,K2,model,params,scoring,scoring_name,n_class
###
ModelB = Param_Comb[BestModelTestB].copy(); ModelA = Param_Comb[BestModelTestA].copy()
#ModelB['model number'] = BestModelTestB; ModelA['model number'] = BestModelTestA
Hyperparams = np.vstack([list(ModelB.values())])#,list(ModelA.values())])
display(pd.DataFrame(data=Hyperparams,index=['Best model'],columns=list(ModelB.keys())))
###
......@@ -668,7 +648,7 @@ def Find_Model(Data,Target,N1,K1,N2,K2,model,params,scoring,scoring_name,n_class
return Param_Comb[BestModelTestB]
###
###
def Get_MyScores(CM_means, n_classes):
def Get_Scores(CM_means, n_classes):
TN = CM_means[0]; FP = CM_means[1]; FN=CM_means[2]; TP=CM_means[3]
......@@ -698,15 +678,20 @@ def Get_MyScores(CM_means, n_classes):
return C, Scores
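Get_Scores derives all scorers from the flattened binary confusion matrix [TN, FP, FN, TP]; a short sketch with the standard definitions of a few of them (not the full original list):

import numpy as np

def scores_sketch(cm):
    TN, FP, FN, TP = cm
    recall    = TP / (TP + FN) if (TP + FN) else np.nan   # true positive rate
    precision = TP / (TP + FP) if (TP + FP) else np.nan
    fpr       = FP / (FP + TN) if (FP + TN) else np.nan   # false positive rate
    f1        = (2 * precision * recall / (precision + recall)
                 if (precision + recall) else np.nan)
    return recall, precision, fpr, f1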
###
###
def Assess_Model(Data,Target,N2,K2,modelTest,n_classes,Plot,Save):
def Assess_Model(Data,Target,N2,K2,modelTest,n_classes,CalibMode,Plot,Save):
## ROC, Precision-Recall curves, confusion matrix
curve_i = np.linspace(0,1,100)
Curves = np.zeros((N2,K2,2,100))
CM = np.zeros((N2,K2,n_classes**2,2))
# Thresholds for decision boundary
thresholds = np.linspace(0,1,101)
## Performance scores
Scorers = np.zeros((N2,K2,22,2))
# Confusion matrix
CM = np.zeros((N2,K2,n_classes**2,2,len(thresholds)))
# Performance scores
Scorers = np.zeros((N2,K2,22,2,len(thresholds)))
# Calibration curve
n_bins = 10
Calib = np.zeros((N2,K2,3,n_bins))*np.nan
for i in range(N2):
CV2 = StratifiedKFold(n_splits=K2, shuffle=True, random_state=i)
......@@ -725,49 +710,64 @@ def Assess_Model(Data,Target,N2,K2,modelTest,n_classes,Plot,Save):
proba_train = modelTest.predict_proba(D_train2) # train score
pred_train = modelTest.predict(D_train2)
## Precision-Recall
precision_curve0, recall_curve0, thresPR0 = metrics.precision_recall_curve(T_test2,proba_test[:,1],pos_label=1)
Scorers[i,ii,0,0] = metrics.auc(recall_curve0,precision_curve0)
precision_curve1, recall_curve1, thresPR1 = metrics.precision_recall_curve(T_train2,proba_train[:,1],pos_label=1)
Scorers[i,ii,0,1] = metrics.auc(recall_curve1,precision_curve1)
recall_curve, index_u = np.unique(recall_curve0,return_index=True); precision_curve = precision_curve0[index_u]
precision_i = interp(curve_i,recall_curve,precision_curve)
## ROC curve
fpr0,tpr0,thresROC0 = metrics.roc_curve(T_test2,proba_test[:,1],pos_label=1)
Scorers[i,ii,1,0] = metrics.auc(fpr0,tpr0)
fpr1,tpr1,thresROC1 = metrics.roc_curve(T_train2,proba_train[:,1],pos_label=1)
Scorers[i,ii,1,1] = metrics.auc(fpr1,tpr1)
tpi_i = interp(curve_i,fpr0,tpr0)
Curves[i,ii,0,:] = precision_i; Curves[i,ii,1,:] = tpi_i
## Confusion matrix
CM[i,ii,:,0] = metrics.confusion_matrix(T_test2, pred_test, labels=range(n_classes)).ravel()
CM[i,ii,:,1] = metrics.confusion_matrix(T_train2, pred_train, labels=range(n_classes)).ravel()
for j in range(len(thresholds)):
class1_test = np.where(proba_test[:,1] > thresholds[j])[0]
pred_test_thres = np.zeros(len(pred_test))
pred_test_thres[class1_test] = 1
class1_train = np.where(proba_train[:,1] > thresholds[j])[0]
pred_train_thres = np.zeros(len(pred_train))
pred_train_thres[class1_train] = 1
# Fill confusion matrix
CM[i,ii,:,0,j] = metrics.confusion_matrix(T_test2, pred_test_thres, labels=range(n_classes)).ravel()
CM[i,ii,:,1,j] = metrics.confusion_matrix(T_train2, pred_train_thres, labels=range(n_classes)).ravel()
## Get scorers
C, Scorers[i,ii,2:,0] = Get_MyScores(CM[i,ii,:,0],n_classes)
C, Scorers[i,ii,2:,1] = Get_MyScores(CM[i,ii,:,1],n_classes)
# Get performance scores
C_tmp, Scorers[i,ii,2:,0,j] = Get_Scores(CM[i,ii,:,0,j],n_classes)
C_tmp, Scorers[i,ii,2:,1,j] = Get_Scores(CM[i,ii,:,1,j],n_classes)
if n_classes > 2:
Scorers[i,ii,19,0,j] = metrics.matthews_corrcoef(T_test2,pred_test_thres)
Scorers[i,ii,19,1,j] = metrics.matthews_corrcoef(T_train2,pred_train_thres)
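A toy illustration of the threshold sweep above: instead of predict()'s implicit 0.5 cut, the positive class is re-assigned from predict_proba at every candidate decision boundary (probabilities hypothetical):

import numpy as np

proba_toy = np.array([0.10, 0.35, 0.55, 0.80])   # P(class 1) for four samples
for thr in (0.3, 0.5, 0.7):
    labels_toy = (proba_toy > thr).astype(int)   # 1 wherever the probability exceeds thr
    print(thr, labels_toy)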
# Area-Under-Curve scores
Scorers[i,ii,6,0,np.isnan(Scorers[i,ii,6,0,:])] = 1; Scorers[i,ii,6,1,np.isnan(Scorers[i,ii,6,1,:])] = 1
Scorers[i,ii,0,0,:] = metrics.auc(Scorers[i,ii,4,0,:],Scorers[i,ii,2,0,:]); Scorers[i,ii,0,1,:] = metrics.auc(Scorers[i,ii,4,1,:],Scorers[i,ii,2,1,:]) #ROC
Scorers[i,ii,1,0,:] = metrics.auc(Scorers[i,ii,2,0,:],Scorers[i,ii,6,0,:]); Scorers[i,ii,1,1,:] = metrics.auc(Scorers[i,ii,2,1,:],Scorers[i,ii,6,1,:]) #P-R
# Calibration curve
if CalibMode == True:
Calib[i,ii,2,:], bin_edges = np.histogram(proba_test[:,1], bins=np.arange(0,1.1,0.1))
Prob_true, Prob_pred = calibration_curve(T_test2, proba_test[:,1], normalize=False, n_bins=n_bins)
bins_present = np.where(Calib[i,ii,2,:]!=0)[0]
Calib[i,ii,0,bins_present]=Prob_true
Calib[i,ii,1,bins_present]=Prob_pred
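A toy example of the calibration_curve call used above: it bins the predicted probabilities and returns, for each non-empty bin, the observed fraction of positives and the mean predicted probability:

import numpy as np
from sklearn.calibration import calibration_curve

y_true_toy = np.array([0, 0, 1, 0, 1, 1, 1, 0, 1, 1])
y_prob_toy = np.array([0.05, 0.20, 0.30, 0.40, 0.55, 0.60, 0.75, 0.45, 0.85, 0.95])
prob_true_toy, prob_pred_toy = calibration_curve(y_true_toy, y_prob_toy, n_bins=5)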
if n_classes > 2:
Scorers[i,ii,19,0] = metrics.matthews_corrcoef(T_test2,pred_test)
Scorers[i,ii,19,0] = metrics.matthews_corrcoef(T_train2,pred_train)
ii += 1
## Scores
Scorers_means = np.mean(Scorers,axis=1)
Model_Mean = np.around(np.mean(Scorers_means,axis=0),4)
Model_Std = np.around(np.std(Scorers_means,axis=0),4)
# Scores
Scorers_means = np.nanmean(Scorers,axis=1)
Model_Mean = np.around(np.nanmean(Scorers_means,axis=0),4)
Model_Std = np.around(np.nanstd(Scorers_means,axis=0),4)
## Sum confusion matrix
# Sum confusion matrix
CM_means_i = np.sum(CM,axis=1); CM_means = np.around(np.mean(CM_means_i,axis=0),0)
C, Scores = Get_MyScores(CM_means[:,0], n_classes)
# Plot mean confusion matrix and curves
# Decide on decision boundary
scores_mean = np.nanmean(Model_Mean[[11,15,18,19,20],0,:],axis=0)
decision_boundary_all = int(thresholds[np.nanargmax(scores_mean)]*100)
decision_boundary = 50 #decision_boundary_all
C, Scores = Get_Scores(CM_means[:,0,decision_boundary], n_classes)
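A small sketch of the boundary choice with a hypothetical score curve: the data-driven boundary would be the threshold that maximizes the mean of the selected scorers, although the code above then fixes the boundary at 0.5:

import numpy as np

thresholds_toy = np.linspace(0, 1, 101)
mean_scores_toy = 1 - (thresholds_toy - 0.42)**2   # hypothetical score curve peaking at 0.42
boundary_toy = int(thresholds_toy[np.nanargmax(mean_scores_toy)] * 100)   # -> 42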
# Plotting
if Plot == True:
pl.CM(CM_means[:,0],C,n_classes,Save)
pl.Curves(N2,K2,Curves,curve_i,Model_Mean[0:2,0],Model_Std[0:2,0],C,Save)
pl.CM(CM_means[:,0,decision_boundary],C,n_classes,Save)
pl.Curves(N2,K2,Scorers,decision_boundary,Model_Mean[0:2,0,:],Model_Std[0:2,0],C,Save)
pl.Decision(thresholds,Model_Mean[:,0,:],Model_Std[:,0,:],Save)
pl.Calibration(Calib,bin_edges,Save)
return Model_Mean, Model_Std
return Model_Mean[:,:,decision_boundary], Model_Std[:,:,decision_boundary], decision_boundary/100
###
###
......@@ -46,6 +46,6 @@ browser. Select 'ClassifyStorms.ipynb' from the list and run the notebook by cli
project's master branch (destination).
### References
+ The original version of this software is a supplement to Pick et al., 2019, submitted.
+ The original version of this software is a supplement to Pick et al., Earth and Space Science, 2019 (submitted).
+ Details on the HMC index are given in Pick et al., JGR Space Physics, 2019
( http://doi.org/10.1029/2018JA026185, with data published under http://doi.org/10.5880/GFZ.2.3.2018.006 ).