Commit ce42be55 authored by Leonie Pick

Initial push.

# Don't track the following folders
Dump/
# Don't track the following files
Input.nc
# Don't track all files with these endings:
*.pyc
# General
import warnings
warnings.filterwarnings('ignore')
import xarray as xr
import numpy as np
import pandas as pd
import datetime as dt
import itertools
import time
# IPython
from IPython.display import display
# Matplotlib
import matplotlib as mpl
mpl.use('TkAgg')
import matplotlib.pyplot as plt
plt.style.use('bmh')
import matplotlib.gridspec as gridspec
# Scipy
from numpy import interp # scipy.interp is deprecated; numpy.interp is the drop-in replacement
from scipy.signal import butter, filtfilt
# Sklearn
import sklearn as sk
from sklearn import metrics, preprocessing
from sklearn.feature_selection import RFE
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
import Plots as pl
from Imports import *
###
###
def LowpassButter(cutoff, fs, order=1):
    nyq = 0.5 * fs # Nyquist frequency
    normal_cutoff = cutoff / nyq # normalise cutoff to the (0, 1) range butter expects
b, a = butter(order, normal_cutoff, btype='low', analog=False)
return b, a
###
###
def LowpassFilter(data, cutoff, fs, order=1):
b, a = LowpassButter(cutoff, fs, order=order)
if data.ndim > 1:
y = np.empty((len(data),np.size(data,1)))
for i in range(np.size(data,1)):
y[:,i] = filtfilt(b, a, data[:,i])
else:
y = filtfilt(b, a, data)
return y
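### Usage sketch (added, hedged): with hourly sampling (fs = 1 sample/hour), a
### cutoff of 1/(5*24) cycles/hour keeps only variations slower than ~5 days;
### filtfilt runs the filter forward and backward, so the output is zero-phase.
### 'hourly_series' is a hypothetical 1-D array, not defined in this module.
# smooth_5d = LowpassFilter(hourly_series, cutoff=1./(5.*24.), fs=1., order=1)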
###
###
def Search_TargetEvents(HMC, HMC11y, HMC5d, dHMC, HTime, DTime, grid, Save):
###### STEP 1 ## Threshold scaled according to the solar cycle phase
Index = HMC
dIndex_h = dHMC
percentile = grid[0]
power = grid[1]
mini_range = grid[2]
thresHMC = grid[3]
scaling = HMC11y/min(HMC11y)
Index_thres1 = np.percentile(HMC,percentile)*scaling**(power)
StormIndices = np.where(np.less(Index, Index_thres1))[0]
#print(np.percentile(HMC,percentile))
########## STEP 2a ## Combine consecutive UT hours into single events
IndexMin = []
IndexDiff = np.diff(StormIndices)
i = 0; j = 0; k = 0
jmax = 0
for i in range(len(IndexDiff)):
if (IndexDiff[i] <= 1 and i < len(IndexDiff)-1): # difference is 1, within interval
if j==0: k+=1
j += 1
elif (IndexDiff[i] > 1): # difference is greater than 1
if j >= jmax:
jmax= j
if j > mini_range:
deriva = dHMC[StormIndices[i-j:i+1]]
deri_sign = np.sign(deriva)
deriva_save1 = np.where(np.diff(deri_sign) == 2)[0]
deriva_save2 = deriva_save1+1
deriva_save = np.hstack([deriva_save1,deriva_save2])
index_save = np.vstack([Index[StormIndices[i-j+np.asarray(deriva_save1)]],Index[StormIndices[i-j+np.asarray(deriva_save2)]]])
rows = np.argmin(index_save,axis=0)
arr = np.array([rows,np.arange(0,np.shape(index_save)[1])])
deriva_save = deriva_save[np.ravel_multi_index(arr, np.shape(index_save))]
IndexMin.extend(StormIndices[i-j+deriva_save].tolist())
else:
IndexMin.append(StormIndices[i-j]+np.argmin(Index[StormIndices[i-j:i+1]]))
if i == len(IndexDiff)-1:
IndexMin.append(StormIndices[i+1])
j = 0
else: # difference is 1, end of interval
IndexMin.append(StormIndices[i-j]+np.argmin(Index[StormIndices[i-j:i+2]]))
# Test here how long j is and do the same as above
#print(jmax)
#print('---')
########## STEP 2b ## Make sure that single events are at least mini_range apart
IndexMin1 = IndexMin[:]
IndexDiff = np.diff(IndexMin1)
for i in range(len(IndexDiff)):
if IndexDiff[i] <= mini_range:
if np.isnan(IndexMin1[i]):
if (IndexMin1[i+1]-IndexMin1[IndexTemp]) <= mini_range:
IndexKeep = np.argmin([Index[IndexMin1[IndexTemp]],Index[IndexMin1[i+1]]])
IndexToss = np.argmax([Index[IndexMin1[IndexTemp]],Index[IndexMin1[i+1]]])
if IndexKeep == 1:
IndexMin1[IndexTemp] = np.nan
IndexTemp = i+IndexKeep
else:
IndexMin1[i+1] = np.nan
                else:
                    IndexTemp = i+1 # store the position within IndexMin1, not its value; IndexMin1[IndexTemp] above indexes by position
else:
IndexKeep = np.argmin([Index[IndexMin1[i]],Index[IndexMin1[i+1]]])
IndexToss = np.argmax([Index[IndexMin1[i]],Index[IndexMin1[i+1]]])
IndexMin1[i+IndexToss] = np.nan
IndexTemp = i+IndexKeep
IndexMin1 = np.delete(IndexMin1,np.where(np.isnan(IndexMin1))[0])
######### STEP 3 ## Remove events that do not meet "typical" HMC drop
IndexMin2 = []
for i in np.asarray(IndexMin1,dtype=int):
#HMC_base = np.nanmedian([HMC[i-48:i-12+1],HMC[i+12:i+48+1]])
HMC_base = HMC5d[i]
HMC_peak = HMC[i]
HMC_diff = (HMC_peak-HMC_base)
if HMC_diff <= -thresHMC:
IndexMin2.append(i)
pl.Selection(HTime,DTime,HMC,Index_thres1,StormIndices,IndexMin,IndexMin1,IndexMin2,Save)
return IndexMin2
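### Note (added): STEP 1 applies a time-varying threshold,
### Index_thres1 = percentile(HMC, p) * (HMC11y/min(HMC11y))**power,
### so the cut follows the 11-year (solar cycle) level of HMC. STEPS 2a/2b then
### merge consecutive threshold crossings into single events at least mini_range
### hours apart, and STEP 3 keeps only events whose drop below the 5-day
### baseline HMC5d exceeds thresHMC.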
###
###
def Get_TargetEvents(HMC, HMC11y, HMC5d, dHMC, Kp_all, KpHours_all, Training, Time, Date, YearStart, Save):
YearsIndex = np.where(Time[:,0] >= YearStart)[0]
TrClass = Training[:,0]
TrTimeIndex = Training[:,1]
TrFound = Training[:,2]
CIRs = np.where(np.logical_or(TrClass == 0, TrClass == 2))[0]
CMEs = np.where(TrClass==1)[0]
#---#
accepted_loss_CMEs = 40
accepted_loss_CIRs = 40
#---#
## Parameters
percentile = np.arange(29,29.1,1) #percentile = np.arange(8,9,1)
power = np.arange(1,1.1,1) #power = np.arange(0.9,1.0,0.1)
mini_range = np.arange(26,26.1,1) #mini_range = np.arange(78,79,1)
thresHMC = np.arange(7,7.1,1) #thresHMC = np.arange(10,11,1)
grid = np.stack(np.meshgrid(percentile,power,mini_range,thresHMC),-1).reshape(-1,4)
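    # Note (added): each np.arange above currently spans a single value, so 'grid'
    # has one row; widening the ranges turns the loop below into a real grid search.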
threshold = 0
for i in range(len(grid)):
Storms = Search_TargetEvents(HMC[YearsIndex], HMC11y[YearsIndex], HMC5d[YearsIndex], dHMC[YearsIndex],
Time[YearsIndex,:], Date[YearsIndex], grid[i,:], Save)
Found_CIRs = np.where(np.in1d(TrTimeIndex[np.logical_or(TrClass==0,TrClass==2)],Storms+YearsIndex[0]))[0]
Found_CMEs = np.where(np.in1d(TrTimeIndex[TrClass==1],Storms+YearsIndex[0]))[0]
#IndexWin = i
#StormsWin = Storms+YearsIndex[0]
#CIRIndices = CIRs[Found_CIRs]
#CMEIndices = CMEs[Found_CMEs]
if ((len(Found_CIRs) >= np.floor((len(CIRs)-len(CIRs)*accepted_loss_CIRs/100))) and (len(Found_CMEs) >= np.floor((len(CMEs)-len(CMEs)*accepted_loss_CMEs/100)))):
criterium = (len(Found_CIRs)+len(Found_CMEs))+(len(YearsIndex)-len(Storms))
if criterium > threshold:
IndexWin = i
StormsWin = Storms+YearsIndex[0]
CIRIndices = CIRs[Found_CIRs]
CMEIndices = CMEs[Found_CMEs]
threshold = criterium
TrFound[CIRIndices] = 1
TrFound[CMEIndices] = 1
Training[:,2] = TrFound
#print(grid[IndexWin,:])
#if Save == True:
#np.save('./Dump/Events',StormsWin)
###
TargetResults = {'No. targets':len(StormsWin),'Fraction [%]':len(StormsWin)*100/len(HMC[YearsIndex])}
TrainingResults = [[len(CIRs), len(CMEs), len(CIRs)+len(CMEs)],[len(Found_CIRs),len(Found_CMEs),len(Found_CIRs)+len(Found_CMEs)]]
SelectionParams = {'HMC percentile':grid[IndexWin,0],'Scaling power':grid[IndexWin,1], 'Separation [h]':grid[IndexWin,2], 'Min. HMC drop [nT]':grid[IndexWin,3]}
display(pd.DataFrame(data=SelectionParams,index=['Selection parameters']))
display(pd.DataFrame(data=TargetResults,index=['Selection result']))
display(pd.DataFrame(data=TrainingResults,columns=['No. CIRs','No. CMEs', 'Total'], index=['Training set', 'in Target set']))
###
pl.IndexDist(Time,YearsIndex,StormsWin,Kp_all,KpHours_all,HMC,Save)
return StormsWin
###
###
def Get_F0(i,HMC,HMC_filt1):
HMC_base = HMC_filt1[i]
HMC_peak = HMC[i]
F0 = -(HMC_peak-HMC_base)
return F0
###
###
def Get_F1F2(i,dHMC):
maxPosr = np.argmax(abs(dHMC[i:i+12+1]))
gradr = abs(dHMC[i+maxPosr:i+12+1])
maxPosl = np.argmax(abs(dHMC[i-12:i+1]))
gradl = abs(dHMC[i-12:i-12+maxPosl+1])
F1 = np.nanmean(gradr); F2 = np.nanmean(gradr)-np.nanmean(gradl)
return F1, F2
###
###
def Get_F3F4F5(i, Block4):
Corr = np.zeros((24,2))
lags = np.arange(-12,13,1)
R = np.zeros(len(lags))
m = 0
for lag in lags:
l = 0
CorrEx = np.ones((24))
for n in [0,2]:
if n == 0:
Corr[:,l] = Block4[i+lag:i+lag+24,n]
else:
Corr[:,l] = Block4[i:i+24,n]
Exclude = np.where(np.isnan(Corr[:,l]))[0]
if len(Exclude) > 0:
CorrEx[Exclude] = 0
l += 1
R[m] = np.corrcoef(Corr[CorrEx==1,0],Corr[CorrEx==1,1])[0,1]
m += 1
Bestlag = lags[np.argmax(abs(R))]
grad = np.gradient(Block4[i:i+24,0],1,edge_order=1)
gradM = np.nanmedian(grad)
F3 = (R[12]); F4 = np.ptp(R); F5 = np.nanmedian(abs(grad-gradM))
return F3,F4,F5
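### Note (added): F3-F5 correlate the dawn sector (Block4[:,0]) against the dusk
### sector (Block4[:,2]) over lags of -12..+12 h in 24-h windows, masking NaNs
### via CorrEx. F3 is the zero-lag correlation R[12], F4 the peak-to-peak spread
### of R across lags, and F5 the median absolute deviation of the dawn-sector
### gradient.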
###
###
def Get_F6(i, Asy_loc):
F6 = len(np.where(Asy_loc[i-48:i-24+1] <= 17.)[0])
return F6
###
###
def Get_F7F8(i, ASY):
AsyDD_loc = np.nanargmax(ASY[i-2:i+2+1,1])
AsyDD_peak = np.nanmax(ASY[i-2:i+2+1,1])
Decay = np.where(ASY[i-2+AsyDD_loc:,1] < 0.5*AsyDD_peak)[0][0]
AsyM_peak = np.nanmax(ASY[i-24:i+1,0])
F7 = AsyM_peak; F8 = Decay
return F7,F8
###
###
def Get_F9(i,dHMC11y):
F9 = dHMC11y[i]
return F9
###
###
def Get_F10(i,Time,PeakTimes,TrTimeIndex,TrClass,HMC,HMC11y,count1,count2):
Recurrence = int(np.floor(27.27*24))
if i <= len(Time)-Recurrence:
cycles = [-1,1]
else:
cycles = [-1]
m = 0
hit = 0
for l in cycles:
lag = l*Recurrence
interval = np.arange(i+lag-26,i+lag+26+1,1)
test = np.where(np.in1d(interval,PeakTimes))[0]
if len(test) == 1:
hit += 1
# Diagnostics
if i in TrTimeIndex:
pointer = np.where(TrTimeIndex==i)[0][0]
if TrClass[pointer] == 0:
if hit >=1:
count1 += 1
else:
if hit == 0:
count2 += 1
F10 = hit
return F10,count1,count2
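### Note (added): F10 looks for a recurrent event one solar rotation
### (27.27 d ~ 654 h) before and, where the series permits, after the current
### peak, within a +/-26 h window; count1/count2 merely tally diagnostics
### against the labelled training events.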
###
###
def Get_F11(i,TrTimeIndex,TrClass):
if i in TrTimeIndex:
position = np.where(TrTimeIndex == i)[0][0]
if TrClass[position] == 0:
F11 = np.random.normal(loc=0.25, scale=0.01, size=1)
elif TrClass[position] == 1:
F11 = np.random.normal(loc=0.5, scale=0.01, size=1)
else:
F11 = np.random.normal(loc=0.75, scale=0.01, size=1)
else:
F11 = 0.5
return F11
###
###
def Get_Features(Time, Storms, Training, HMC, HMC11y, HMC1y, dHMC, B_MLT, ASY, Save):
PeakTimes = Storms; PeakIndices = np.arange(0,len(Storms),1)
n_samples = len(PeakTimes)
Features = [0,1,2,3,4,5,6,7,8,9,10,11]
n_features = len(Features)
FeatureMatrix = np.zeros((n_samples,n_features))+np.nan
Block4 = np.zeros((len(Time),4))
Block4[:,0] = np.nanmedian(B_MLT[:,3:9],axis=1) # dawn
Block4[:,1] = np.nanmedian(B_MLT[:,9:15],axis=1) # noon
Block4[:,2] = np.nanmedian(B_MLT[:,15:21],axis=1) # dusk
Block4[:,3] = np.nanmedian(np.hstack([B_MLT[:,21:24],B_MLT[:,0:3]]),axis=1) # midnight
dAsyDD = np.gradient(ASY[:,0],1,edge_order=1)
dHMC11y = np.gradient(HMC11y,1,edge_order=1)
TrTimeIndex = Training[:,1]#Training.sel(Properties='TimeIndex')
TrClass = Training[:,0]#Training.sel(Properties='Class')
excludeEvents = []
count1=0; count2=0
j = 0
for i in PeakTimes:
k = 0
if i <= len(Time)-36 and i >= 27.27*24:
if 0 in Features: # Feature 1:
F0 = Get_F0(i,HMC,HMC1y)
FeatureMatrix[j,k] = F0; k+=1
if 1 in Features or 2 in Features: # Feature 2 & 3:
F1, F2 = Get_F1F2(i, dHMC)
if 1 in Features: FeatureMatrix[j,k] = F1; k+=1
if 2 in Features: FeatureMatrix[j,k] = F2; k+=1
            if 3 in Features or 4 in Features or 5 in Features: # Features 4, 5 & 6:
F3, F4, F5 = Get_F3F4F5(i, Block4)
if 3 in Features: FeatureMatrix[j,k] = F3; k+=1
if 4 in Features: FeatureMatrix[j,k] = F4; k+=1
if 5 in Features: FeatureMatrix[j,k] = F5; k+=1
if 6 in Features: # Feature 7:
F6 = Get_F6(i, ASY[:,2])
FeatureMatrix[j,k] = F6; k+=1
if 7 in Features or 8 in Features: # Features 8 & 9:
F7,F8 = Get_F7F8(i, ASY)
if 7 in Features: FeatureMatrix[j,k] = F7; k+=1
if 8 in Features: FeatureMatrix[j,k] = F8; k+=1
if 9 in Features: # Feature 10:
F9 = Get_F9(i, dHMC11y)
FeatureMatrix[j,k] = F9; k+=1
if 10 in Features: #Feature 11:
F10, count1, count2 = Get_F10(i,Time,PeakTimes,TrTimeIndex,TrClass,HMC,HMC11y,count1,count2)
FeatureMatrix[j,k] = F10; k+=1
if 11 in Features: # Feature 12:
F11 = Get_F11(i,TrTimeIndex,TrClass)
FeatureMatrix[j,k] = F11
else:
excludeEvents.append(j)
j += 1
FeatureMatrix = np.delete(FeatureMatrix,excludeEvents,axis=0)
PeakIndices = np.delete(PeakIndices,excludeEvents)
    if np.any(np.in1d(PeakTimes[excludeEvents], TrTimeIndex)): # 'in' does not test array membership element-wise; in1d does
        print('Caution! Excluded event(s) overlap the training set.')
if Save == True:
np.savez('./Dump/Out/Features',Features=FeatureMatrix,FeatureIndices=PeakIndices)
return PeakIndices, FeatureMatrix
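### Note (added): rows of FeatureMatrix follow PeakTimes; events within 36 h of
### the series end or within one solar rotation of its start are dropped
### (excludeEvents), and PeakIndices maps the surviving rows back to Storms.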
###
###
def Get_Diagnostics(FeatureMatrix,Storms,Training,Save):
TrClass = Training[:,0]
TrTimeIndex = Training[:,1]
TrKnown = Training[:,2]
n_features = FeatureMatrix.shape[1]
n_classes = len(np.unique(TrClass))
IndicesD = np.where(np.in1d(Storms,TrTimeIndex))[0]
TargetKnown = np.where(TrKnown==1)[0]
Data = FeatureMatrix[IndicesD,:]
NData = (Data-Data.mean(0))/Data.std(0)
NData2 = (NData-NData.min(0))/NData.ptp(0)
    keys = np.arange(0,n_classes,1) # one key per class (the former n_classes+1 produced an unused key)
    TargetClasses = dict(zip(keys, np.zeros(n_classes)))
    NData = dict(zip(keys, np.zeros(n_classes))) # NOTE: re-binds NData from the normalised array to a per-class dict
    MData = dict(zip(keys, np.zeros(n_classes)))
    SData = dict(zip(keys, np.zeros(n_classes)))
for i in range(n_classes):
TargetClasses[i]=np.where(TrClass[TargetKnown]==i)[0]
#FData[i] = Data[TargetClasses[i],:]
NData[i] = NData2[TargetClasses[i],:]
MData[i] = np.nanmedian(NData[i],axis=0)
SData[i] = np.nanstd(NData[i],axis=0)
FeatureResults = np.zeros((n_features,5))
Ranges = np.zeros((n_features,n_classes,2))
k = 0
for i in [0,3,5,9,10,6,7,8,1,4,2,11]:#range(n_features):
#vall = np.zeros(n_classes); valr = np.zeros(n_classes)
for j in range(n_classes):
vall=np.percentile(NData[j][:,i],25); valr = np.percentile(NData[j][:,i],75)#; val2l = np.percentile(NCMEs[:,i],25); val2r = np.percentile(NCMEs[:,i],75)
FeatureResults[k,j] = np.around(MData[j][i],3)
Ranges[k,j,:] = [vall,valr]
Left = np.argmin(Ranges[k,:,0])
if Left == 0:
Overlap = Ranges[k,0,1]-Ranges[k,1,0]
else:
Overlap = Ranges[k,1,1]-Ranges[k,0,0]
#print(i,np.around(valr-vall,3))
FeatureResults[k,2] = np.around(abs(MData[0][i]-MData[1][i]),3)
FeatureResults[k,3] = np.around(Overlap,3)
FeatureResults[k,4] = np.around(abs(MData[0][i]-MData[1][i])-Overlap,3)
k += 1
###
    display(pd.DataFrame(data=FeatureResults,index=np.linspace(1,12,12,dtype=int),columns=['Q2 CIRs','Q2 CMEs','Q2 Difference','IQR Overlap','Q2 Difference - IQR Overlap']))
###
pl.Diagnostics(n_features, n_classes, NData, Save)
###
###
def HSS_score(y, y_pred):
TN,FP,FN,TP = metrics.confusion_matrix(y, y_pred, labels=[0,1]).ravel()
HSS = 2*(TP*TN-FN*FP)/((TP+FN)*(FN+TN)+(TP+FP)*(FP+TN))
return HSS
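### Worked example (added): for TN=50, FP=10, FN=5, TP=35,
### HSS = 2*(35*50 - 5*10)/((35+5)*(5+50) + (35+10)*(10+50)) = 3400/4900 ~ 0.694.
### HSS = 1 is a perfect forecast; HSS <= 0 means no skill beyond chance.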
###
###
def Get_Scorer():
mcc_scorer = metrics.make_scorer(metrics.matthews_corrcoef)
hss_scorer = metrics.make_scorer(HSS_score)
scoring = {'MCC': mcc_scorer, 'HSS': hss_scorer}
return scoring
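### Usage sketch (added, hedged): the dict plugs into sklearn's multi-metric
### scoring API; 'pipe', 'X' and 'y' are placeholders, not defined here.
# cv_res = cross_validate(pipe, X, y, scoring=Get_Scorer(), cv=5)
# cv_res['test_MCC'], cv_res['test_HSS'] # one score array per metric and fold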
###
###
def Get_Pipelines(whichEstimator,nClasses):
## Pre-processing
Scaler = preprocessing.StandardScaler(with_mean=True,with_std=True)
## Estimators & Feature selection strategy
if whichEstimator == 'LogReg':
Estimator = LogisticRegression(solver='saga',class_weight='balanced',multi_class='auto')
BEstimateParams = {'estimate__C':[1]}
OEstimateParams = {'estimate__C':[0.001,0.01,0.1,1,10],'estimate__penalty':['l1','l2']}
elif whichEstimator == 'SVC':
Estimator = SVC(class_weight='balanced',probability=True)
BEstimateParams = {'estimate__C':[1]}
OEstimateParams = BEstimateParams
elif whichEstimator == 'RandomForest':
Estimator = RandomForestClassifier(class_weight='balanced')
BEstimateParams = {'estimate__n_estimators':[10]}
OEstimateParams = BEstimateParams
elif whichEstimator == 'NeuralNet':
Estimator = MLPClassifier()
BEstimateParams = {'estimate__hidden_layer_sizes':[100]}
OEstimateParams = BEstimateParams
elif whichEstimator == 'LDA':
Estimator = LinearDiscriminantAnalysis()
BEstimateParams = {'estimate__solver':['svd']}
OEstimateParams = BEstimateParams
elif whichEstimator == 'QDA':
Estimator = QuadraticDiscriminantAnalysis()
BEstimateParams = {'estimate__tol':[1.0e-4]}
OEstimateParams = BEstimateParams
## Feature selection
FeatureSelect = RFE(Estimator, step = 1)
SelectParams = {'select__n_features_to_select':[1,2,3,4,5,6,7,8,9,10,11]}
## Pipelines
BPipe = Pipeline([('scale', Scaler),('estimate', Estimator)])
OPipe = Pipeline([('scale', Scaler),('select', FeatureSelect),('estimate', Estimator)])
return BPipe,BEstimateParams,OPipe,SelectParams,OEstimateParams
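### Usage sketch (added, hedged): typical call pattern; the baseline pipeline
### (BPipe) skips feature selection, while OPipe tunes the RFE size together
### with the estimator's hyper-parameters.
# BPipe, BParams, OPipe, SelParams, OParams = Get_Pipelines('LogReg', 2)
# grid = {**SelParams, **OParams} # merged parameter grid for OPipe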
###
###
def Select_Features(Data,Target,N2,K2,Pipe,SelectParams,EstimateParams,scoring,scoring_name,n_classes,estimators,est,ax,Save):
nFeatures = Data.shape[1]
Scores = np.zeros((N2,nFeatures,2))
Indices = np.zeros((N2))
for i in range(N2):
CV2 = StratifiedKFold(n_splits=K2, shuffle=True, random_state=i)
        GS = GridSearchCV(Pipe, SelectParams, n_jobs=1, scoring=scoring, cv=CV2, refit=scoring_name, return_train_score=True) # 'iid' dropped: deprecated in sklearn, and the string 'False' evaluated as truthy anyway
GS.fit(Data,Target)
for name in GS.cv_results_.keys():
if name.startswith('mean'):
if scoring_name in name and 'test' in name:
Scores[i,:,0] = GS.cv_results_[name]
elif scoring_name in name and 'train' in name:
Scores[i,:,1] = GS.cv_results_[name]
Indices[i] = GS.best_index_
#print(GS.best_score_)
#print(GS.cv_results_['params'])
# Fix number of features in estimation parameters
Scores_mean = np.mean(Scores,axis=0)
Scores_std = np.std(Scores,axis=0)
FeaturesSelect = np.argmax(Scores_mean[:,0])+1