Commit f73ad39a authored by Janis Jatnieks

Initial commit

.Rdata
.Rhistory
*.pyc
*.bak
# Surrogate experiments controller
# Author: Janis Jatnieks, janis.jatnieks@gfz-potsdam.de, +49(0)157 3245 1188
require(stringr)
require(data.table)
# This calls all the other dependencies
source("Surrogate_playground.R")
# read the data tables
IN_training_data_path = "train/100_IN.csv"
OUT_training_data_path ="train/100_OUT.csv"
INtable = fread(IN_training_data_path)
OUTtable = fread(OUT_training_data_path)
# subset columns
INselection =c("C","Ca","Cl","Mg","N","Na","S","Si","H*","O*","Charge","Pressure","MatUnsatPorosity","UD_p4f","UD_pH","UD_pe","UD_mass_H2O","UD_charge(eq)","UD_O(0)(mol/kgw)","UD_Ca(mol/kgw)","UD_Calcite","UD_Kaolinite","UD_si_Calcite","UD_si_Gypsum","UD_si_K-Feldspar", "UD_si_Kaolinite","Time")
OUTselection=c("C","Ca","Cl","Mg","N","Na","S","Si","H*","O*","Charge","Pressure","MatUnsatPorosity","UD_p4f","UD_pH","UD_pe","UD_mass_H2O","UD_charge(eq)","UD_O(0)(mol/kgw)","UD_Ca(mol/kgw)","UD_Calcite","UD_Kaolinite","UD_si_Calcite","UD_si_Gypsum","UD_si_K-Feldspar", "UD_si_Kaolinite","Time")
INtable = subset(INtable, select=INselection )
OUTtable = subset(OUTtable,select=OUTselection)
# run the model fitting, validation and selection workflow with the given methods
Main(c("MARS","MARS3","PolyMARS","PolyMARS3","mda_p01","polymars_p01","brnn","qrnn","lars2","lasso","lars")
,preprocessing_ind = c(17) # no pre-processing
,input_data = INtable
,output_data = OUTtable
,seed = 105
# use of parallelization for training, validation and preprocessing steps
,train_para = T, run_para = F, preproc_para = T
,selection_criteria = "RMSE"
# drops any such named column from input as well as output
,exclude_columns = c("Head")
# allow these columns to have negative values; all other columns are
# clipped to zero so that concentration values stay positive
,allow_neg_cols= c("Pressure","H*","O*")
# assumes this will be passed in from the model coupling to the Surrogate at run-time
,external = c("Time","Pressure","MatUnsatPorosity","Saturation","Temp","MatHeatPorosity","MatFlowStorativity","UD_p4f")
# dump error tables
,write_full_residuals=T
# run caret tuning routines
,tuner=T
# fraction of the data to use for training
,training_samples = 0.6
)
# do a quick check that the resulting surrogate model actually runs
qtestin <- as.matrix( head(INtable,1000) )
qres <- data.table( SelectedSurrogate(qtestin) )
cat("\n\n")
print( head(qres) )
print( head(OUTtable) )
## extract tables from Marco's Rphree data structures
Reshaping <- function(data) {
if (missing(data)) stop("Reshaping called with no argument!")
## prepare IO data from T and C matrices
## extract original data from RT simulations
cat('Extracting original data...')
Oin <<- TByElem( list_of_elems=data, nested_index=1 )
Oout <<- TByElem( list_of_elems=data, nested_index=2 )
}
## load externally dumped data with autodetection
Loadata <- function( IN="dumps/dumpIN.txt", OUT="dumps/dumpOUT.txt" ) {
cat('Loading INPUT-OUTPUT tables...')
Oin <<- fread( IN)
Oout <<- fread(OUT)
if ( identical(Oin,Oout) ) {stop("FAIL! INPUT-OUTPUT DATA TABLES IDENTICAL!")}
}
# This is an idea worth testing - we care about most of these
# values only up to a certain number of digits.
# However, model drift can cause some of the values to fall outside
# the input ranges that the prediction model is able to deal with
# reasonably well. The default option would be to resample the
# simulator in those regions. A quicker heuristic also worth trying
# is to round the tables to a coarser "resolution" of numbers that
# are meaningful for the scenario and fit those as alternative
# "extrapolating" options.
MultiRound <- function( t, rounding_digits=c(8,7,6) ) {
out_table <- t
for (digits in rounding_digits) { out_table <- rbind( out_table, smartround(t, digits) ) }
return ( out_table )
}
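## Hedged usage sketch (toy table, not from this dataset): MultiRound stacks
## the original table with one smartround()-ed copy per digit level, so a
## 100-row table with rounding_digits=c(8,7,6) comes back with 400 rows:
# toy <- data.table( a=runif(100), b=runif(100) )
# stacked <- MultiRound( toy, rounding_digits=c(8,7,6) )
# nrow(stacked) # 400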
# additional cleaning operations
Filtering <- function(
roundto=Inf, # setting Inf switches rounding off
dedup=T,
exclude=numeric(),
exclude_output=c(),
try_multi_round = c(),
to_factor = F, # convert columns with a low number of unique values to factors?
# F - all negative values forced to zero
# T - all negative values allowed
# c("field","names") - allow only these columns to be negative
allow_negative = T # e.g. c("Pressure","Charge")
) {
# if negatives are forced to zero then we start with that
if ( class(allow_negative)=="logical" ) {
if (!allow_negative) { # if F then all negatives are forced to 0
cat('NEGATIVES: Clipping all negative values in all data to zero...be sure this is really what you want!!!')
Oin[Oin<0] <<- 0
Oout[Oout<0] <<- 0
}
} else { # if field names are specified then the remaining columns are clipped to zero
Oin <<- AllowSomeNegativeColumns( Oin, allow_negative)
Oout<<- AllowSomeNegativeColumns( Oout,allow_negative)
}
# make sure there is actually data even if none of the filtering options are run
Fin <<- Oin
Fout<<- Oout
# deduplication works by syncing indexes in the two tables
cat('excluding...')
if ( length(exclude) > 0 ) {
Fin <<- exc(Oin, exclude)
Fout <<- exc(Oout,exclude)
}
cat('also outputs',exclude_output,"...")
if ( length(exclude_output) > 0 ) {
Fout <<- exc(Fout,exclude_output)
}
if ( roundto != Inf ) {
cat("\nrounding...")
Fin <<- smartround( Fin,roundto )
Fout <<- smartround( Fout,roundto )
}
if ( length(try_multi_round) > 0 ) {
cat("multirounding...")
# use the already filtered tables here so exclusions are not silently undone
Fin <<- MultiRound( Fin,try_multi_round )
Fout <<- MultiRound( Fout,try_multi_round )
}
cat("checking for static input columns...")
Fin <<- SelectActiveColumns(Fin)
Fout <<- SelectActiveColumns(Fout)
# remove non-unique parameter combinations after filtering and rounding
if (dedup) {
cat("deduplicating...")
dups <<- duplicated(Fin)
Fin <<- Fin[ which(dups==F), ]
Fout <<- Fout[ which(dups==F), ]
redundancy=(sum(dups==T)/length(dups))*100
cat( redundancy,"%...")
}
if (to_factor) {
cat("looking for OUTPUT columns that could be converted to factors for using classification instead of regression...\n")
Fout <<- Cols2Fact(Fout)
}
cat("IN-OUT different?...")
n_the_same = NumericallyCompareTables(Oin,Oout)
n_common_colnames = length( intersect(colnames(Oin),colnames(Oout)) )
if ( n_the_same/n_common_colnames > 0.8 & n_common_colnames>1 ) stop (" ERROR: TOO MANY IDENTICAL IN-OUT COLUMNS, CHECK YOUR DATA!\n")
cat('done!\n')
}
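## Hedged usage sketch: assuming Oin/Oout have been populated by Loadata() or
## Reshaping(), a typical call drops the "Head" column, rounds both tables to
## 6 digits, deduplicates, and leaves the results in the globals Fin/Fout:
# Loadata()
# Filtering( roundto=6, dedup=T, exclude=c("Head"), allow_negative=c("Pressure") )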
AllowSomeNegativeColumns <- function( dfrmlike, allow_negative_cols ) {
neg_cols_actually_exist = c()
if ( length(allow_negative_cols)>0) {
neg_cols_actually_exist = allow_negative_cols[ allow_negative_cols %in% colnames(dfrmlike) ] }
if ( length(neg_cols_actually_exist) > 0 ) {
temp1 <- subset(dfrmlike, select = neg_cols_actually_exist) # these keep their negatives
}
temp2 <- subset(dfrmlike, select = setdiff(colnames(dfrmlike),neg_cols_actually_exist) )
temp2[temp2<0]<-0
if ( length(neg_cols_actually_exist) > 0 ) { allcols <- cbind(temp2,temp1) }
else allcols <- temp2
return(allcols)
}
Relcal <- function() {
# only do this on the performance table columns that are numeric and therefore suitable
relacols <<- colnames(Sperf)[laply( Sperf,.fun=class )=="numeric"]
#todo: finish (round(colwise_rescale(subset(Sperf,select=relacols),minval=0,maxval=1),5)-1)*-1
llply( relacols, .fun=function(colname) {
colvals <<- dt2v(Sperf,colname)
# 0.0e-32 - (colvals/max(colvals))*-1 simplifies to colvals/max(colvals)
round( colvals / max(colvals), 5 )
#Sperf[ paste0("relat_",colname):=colvals/min(colvals) ]
})
}
# The method names are enumerated here so that large text blocks
# do not repeat in the pre- and post-processing code
specific_preprocessing_methods <<- c(
"(0,1) scaled rolling deltas", # 1
"(-1,1) scaled rolling deltas", # 2, usually not worth it
"z-scaled rolling deltas", # 3
"z-scaled", # 4
"deltas on z-scaled", # 5
"deltas on (0,1) scaled", # 6
"deltas on (-1,1) scaled", # 7, usually not worth it
"(0,1) scaled", # 8
"(0,10) scaled", # 9
"(-10,10) scaled", # 10
"(0,10) scaled rolling deltas", # 11
"(-10,10) scaled rolling deltas", # 12
"(0,1) scaled substract", # 13
"(0,10) scaled substract", # 14
"(-10,10) scaled substract", # 15
"z-scaled substract", # 16
"no", # 17
"pca", # 18
"ica", # 19
"BoxCox", # 20
"YeoJohnson", # 21
"expoTrans", # 22
"range", # 23
"scale", # 24
"spatialSign", # 25
"center and scale" # 26
)
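## Example: indices select methods by position, e.g.
## specific_preprocessing_methods[17]        # "no" - the identity option used by the controller above
## specific_preprocessing_methods[c(18,20)]  # "pca", "BoxCox"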
# caret has its own pre-processing infrastructure, but it does not cover everything;
# also, we may wish to experiment a little with experiment-specific preprocessing
SelectPreProc <- function(method=8,IOranges=originalIOranges,v=T#,
# if this is passed in, that means we need to preprocess it
# using previously stored ranges
#given_input=NULL
) {
mrm( c("PPC","PPCout"), verbose=F )
if (v) { cat('\n\nApplying preprocessing with',specific_preprocessing_methods[method],'...') }
#cat("\nOriginal preproc:\n")
#print(str(original_preproc))
#print(original_preproc)
# Here we create the preprocessing method as a function that can be uniformly called globally.
# Most often only the input may need to be normalized, but some of the approaches may
# require the optional original range parameters (original min/max in the case of stretching,
# and center and scale in the case of z-scaling). This is very important because when the
# surrogate model is used instead of the simulation model and gets a table of inputs for
# which it needs to predict the simulator outputs, its normalization has to use the same
# parameters that were used for the original training data; otherwise the predictor gets
# input that is different from what we may think it gets, causing wrong predictions for
# purely technical reasons.
# Overall there are some general ways to use preprocessing:
# 1. preprocess only inputs, usually scale something using given input and output ranges
# ==> defines the PPC function for use on input tables
# 2. preprocess the output table as well, in some way unrelated to the input table
# ==> defines the PPCout function and the PPCback backtransform function
# 3. preprocess both tables using some formula between them (like subtracting input from output)
# this is complicated, costly and possibly wrong, TODO: need to discuss
# 4. do nothing :) - sometimes best, as some methods actually like this :)
if (method==1) { PPC <<- function(inp) { rescaling( Deltas(inp), IOranges, 0,1) }
PPCout <<- function(inp,outp) { rescaling( Deltas(outp),IOranges, 0,1) }}
if (method==2) { PPC <<- function(inp) { rescaling( Deltas(inp), IOranges, -1,1) }
PPCout <<- function(inp,outp) { rescaling( Deltas(outp),IOranges, -1,1) }}
if (method==3) { PPC <<- function(inp) { scale( Deltas(inp) ) }}
# PPCout <<- function(inp,outp) { rescaling( Deltas(outp),IOranges, -1,1) }}
if (method==4) { PPC <<- function(inp) { scale( inp ) }}
if (method==5) { PPC <<- function(inp) { Deltas( scale(inp) ) }}
if (method==6) { PPC <<- function(inp) { Deltas( rescaling(inp, IOranges, 0,1) ) }
PPCout <<- function(inp,outp) { Deltas( rescaling(outp, IOranges,0,1)) }}
if (method==7) { PPC <<- function(inp) { Deltas( rescaling(inp, IOranges,-1,1) ) }
PPCout <<- function(inp,outp) { Deltas( rescaling(outp, IOranges,-1,1) ) }}
if (method==8) { PPC <<- function(inp) { rescaling( inp, IOranges, 0,1) }}
if (method==9) { PPC <<- function(inp) { rescaling( inp, IOranges, 0,10) }}
if (method==10){ PPC <<- function(inp) { rescaling( inp, IOranges, -10,10) }}
if (method==11){ PPC <<- function(inp) { colwise_rescale( Deltas( inp ), 0,10) }}
if (method==12){ PPC <<- function(inp) { colwise_rescale( Deltas( inp ),-10,10) }}
if (method==13){ PPC <<- function(inp) { rescaling( inp, IOranges, 0,1) }
PPCout <<- function(inp,outp) { rescaling( outminus(inp,outp), IOranges, 0,1) }}
if (method==14){ PPC <<- function(inp) { rescaling( inp, IOranges, 0,10) }
PPCout <<- function(inp,outp) { rescaling( outminus(inp,outp), IOranges, 0,10) }}
if (method==15){ PPC <<- function(inp) { rescaling( inp, IOranges, -10,10) }
PPCout <<- function(inp,outp) { rescaling( outminus(inp,outp), IOranges, -10,10) }}
if (method==16){ PPC <<- function(inp) { scale( inp ) }
PPCout <<- function(inp,outp) { scale( outminus(inp,outp) ) }}
if (method==17){ PPC <<- function(inp) { inp }} # doing nothing may also be worth checking out :)
# Somewhat confusingly, the predict call applies a previously stored pre-processing transform to the supplied data;
# also, here we create a function PPC that will actually do the transform when needed!
if (method==18){ PPC <<- function(inp) { predict(original_preproc[["pca"]],inp) }}
if (method==19){ PPC <<- function(inp) { predict(original_preproc[["ica"]],inp) }}
if (method==20){ PPC <<- function(inp) { predict(original_preproc[["BoxCox"]],inp) }}
if (method==21){ PPC <<- function(inp) { predict(original_preproc[["YeoJohnson"]],inp) }}
if (method==22){ PPC <<- function(inp) { predict(original_preproc[["expoTrans"]],inp) }}
if (method==23){ PPC <<- function(inp) { predict(original_preproc[["range"]],inp) }}
if (method==24){ PPC <<- function(inp) { predict(original_preproc[["scale"]],inp) }}
if (method==25){ PPC <<- function(inp) { predict(original_preproc[["spatialSign"]],inp) }}
if (method==26){ PPC <<- function(inp) { predict(original_preproc[["c(\"center\", \"scale\")"]],inp) }}
}
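## Hedged usage sketch: SelectPreProc() only (re)defines the global PPC (and,
## for some methods, PPCout) closures over IOranges; applying them to new data
## at run-time then reuses the training-time ranges:
# SelectPreProc( method=8 )            # (0,1) scaling against originalIOranges
# scaled_in <- PPC( data.matrix(Fin) )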
NumericallyCompareTables <- function( in_table, out_table ) {
common_cols = intersect(colnames(in_table),colnames(out_table) )
table_diff <- sapply(common_cols,FUN=function(col) {
coldiff <- dt2v(in_table,col)-dt2v(out_table,col)
any(coldiff != 0) # TRUE if the column differs anywhere
} )
cat("Completely identical IN and OUT columns =",sum(table_diff==F),":",names(table_diff)[table_diff==F],"\n")
return(sum(table_diff==F))
}
SubstractCommonTableCols <- function(tab1,tab2) {
common_cols = intersect(colnames(tab1),colnames(tab2) )
sapply(common_cols,FUN=function(col) { dt2v(tab1,col)-dt2v(tab2,col) } )
}
# Creates the global Tin and Tout training matrices
BatchPreProc_ <- function() {
# if a special output preprocessor (one defining dependencies between IN and OUT) exists, use it
if ( exists("PPCout") ) {
cat('\nSpecial output preprocessing used!\n')
Tout <<- data.matrix( PPCout(Fin, Fout ) )
}
else { # otherwise just use the filtered output, making sure it is a matrix
Tout <<- data.matrix( Fout )
}
Tin <<- PPC(data.matrix( Fin ) )
}
# Creates the global Tin and Tout training matrices, optionally for selected output columns only
BatchPreProc <- function(selcol=c()) {
# if a special output preprocessor (one defining dependencies between IN and OUT) exists, use it
if ( length(selcol)>0 ) {
cat("Selecting",selcol,"from output\n")
selout <- dt2v( Fout, selcol )
}
else {
selout <- Fout
}
if ( exists("PPCout") ) {
cat('\nSpecial output preprocessing used!\n')
Tout <<- data.matrix( PPCout( Fin, selout ) )
}
else { # otherwise just use the filtered output, making sure it is a matrix
Tout <<- data.matrix( selout )
}
Tin <<- PPC( data.matrix(Fin) )
}
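## Hedged usage sketch: once PPC/PPCout exist, BatchPreProc materializes the
## training matrices, optionally restricted to one output column
## (the column name here is illustrative):
# BatchPreProc()          # all output columns -> globals Tin, Tout
# BatchPreProc("UD_pH")   # single-target training matrices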
# This is something very critically important:
# when we get the overall original dataset in the beginning, we
# have to store the initial ranges and parameters for all
# preprocessing options that could be used. This is because we
# will want to re-use outputs again as inputs when they are passed
# back to the predictor. Also, if values outside the original
# ranges get passed to the predictor, the normalization basis should
# not be confused by this. This will not adversely affect range, scale and the
# usual preprocessors, but could adversely affect distribution-based methods!
InitPreProc <- function(dfrm,
pre_proc_methods_inds=c(17)
#caret_methods=list("scale")#,"range","BoxCox",#"expoTrans",#"YeoJohnson","pca","spatialSign",
#c('center','scale')),
,para=T
) {
cat("\nInitilizing preprocessors...")
preproc_names <- specific_preprocessing_methods[pre_proc_methods_inds]
caret_methods <<- sapply(preproc_names,FUN=function(ppn) {
out <- ppn
if (ppn=="center and scale") { out = c('center','scale') }
if (ppn=="smart") { out = c('zv','nzv','conditionalX') }
return( out ) }, simplify = T,USE.NAMES=T)
if ( is.na(caret_methods[1]) ) { caret_methods <<- caret_methods[-1] }
if ( ("no" %in% names(caret_methods) ) & length(caret_methods)>1 ) {
caret_methods <<- caret_methods[2:length(caret_methods)]
caret_methods <<- as.character(caret_methods)
print(caret_methods)
preproc <- llply(caret_methods,.fun=function(m) { preProcess(dfrm,m) },.parallel=para)
names(preproc) <- caret_methods
} else { preproc = list() }
return(preproc)
}
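## Hedged usage sketch: fit the caret preProcess objects once on the original
## data so that SelectPreProc methods 18-26 can reuse the stored parameters:
# original_preproc <- InitPreProc( Fin, pre_proc_methods_inds=c(17,18,20) )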
# this generally wraps the caret downsampling capabilities, but tries to do the same for
# regression by casting the outputs as character classes
DownSampler <- function( ins,outs ) {
ur <- coluniratio(outs)
cat("\nOutput column has",ur*100,"% unique values.\nWill downsample the data now...")
cat("\ncasting to char...")
clasouts <- as.factor(as.character(as.matrix(outs)))
cat("downsampling...")
dr <- downSample(ins,clasouts,list=T)
cat("got",nrow(dr$x),"downsampled samples!")
return(dr)
}
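## Hedged usage sketch (target column name illustrative): regression targets
## are cast to character classes so that caret::downSample can balance them;
## the returned list holds inputs in dr$x and the class labels in dr$y:
# dr <- DownSampler( Fin, dt2v(Fout,"UD_Calcite") )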
OverrideValueRange <-function( intable, minval=0.23, maxval=0.23, newval=0.0981 ) {
cat("\nConstraining values <=",minval,"and >=",maxval,"to",newval)
intable[intable<=minval]=newval
intable[intable>=maxval]=newval
return(intable)
}