Commit dba87f61 authored by Daniel Scheffler's avatar Daniel Scheffler
Browse files

Added Random Forest Regression as new spectral homogenization method (uses 50 trees). Added test.

parent fbb1d692
Pipeline #3187 passed with stage
in 17 minutes and 41 seconds
......@@ -122,7 +122,7 @@ def run_from_constraints(args):
def _run_job(dbJob, **config_kwargs):
# type: (GMS_JOB) -> None
# type: (GMS_JOB, dict) -> None
"""
:param dbJob:
......
......@@ -26,12 +26,14 @@ import traceback
import zipfile
import tempfile
import time
import warnings
from sklearn.cluster import k_means_ # noqa F401 # flake8 issue
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline, Pipeline # noqa F401 # flake8 issue
from sklearn.ensemble import RandomForestRegressor
from geoarray import GeoArray # noqa F401 # flake8 issue
from ..options.config import GMS_config as CFG
......@@ -1246,7 +1248,7 @@ class ClusterClassifier_Generator(object):
@staticmethod
def train_machine_learner(train_X, train_Y, test_X, test_Y, method, **kwargs):
# type: (np.ndarray, np.ndarray, np.ndarray, np.ndarray, str, dict) -> Union[LinearRegression, Ridge, Pipeline]
# type: (np.ndarray, np.ndarray, np.ndarray, np.ndarray, str, dict) -> Union[LinearRegression, Ridge, Pipeline, RandomForestRegressor] # noqa E501 (line too long)
"""Use the given train and test data to train a machine learner and append some accuracy statistics.
:param train_X: reference training data
......@@ -1254,9 +1256,10 @@ class ClusterClassifier_Generator(object):
:param test_X: reference test data
:param test_Y: target test data
:param method: type of machine learning classifiers to be included in classifier collections
'LR': Linear Regression
'RR': Ridge Regression
'QR': Quadratic Regression
'LR': Linear Regression
'RR': Ridge Regression
'QR': Quadratic Regression
'RFR': Random Forest Regression (50 trees)
:param kwargs: keyword arguments to be passed to the __init__() function of machine learners
"""
###################
......@@ -1338,13 +1341,26 @@ class ClusterClassifier_Generator(object):
:param outDir: output directory for the created cluster classifier collections
:param method: type of machine learning classifiers to be included in classifier collections
'LR': Linear Regression
'RR': Ridge Regression
'QR': Quadratic Regression
'LR': Linear Regression
'RR': Ridge Regression
'QR': Quadratic Regression
'RFR': Random Forest Regression (50 trees)
:param n_clusters: number of clusters to be used for KMeans clustering
:param CPUs: number of CPUs to be used for KMeans clustering
:param kwargs: keyword arguments to be passed to machine learner
"""
# validate and set defaults
if method == 'RFR':
if n_clusters > 1:
warnings.warn("The spectral homogenization method 'Random Forest Regression' does not allow spectral "
"sub-clustering. Setting 'n_clusters' to 1.")
n_clusters = 1
if 'n_jobs' not in kwargs:
kwargs.update(dict(n_jobs=CPUs))
if 'n_estimators' not in kwargs:
kwargs.update(dict(n_estimators=CFG.spechomo_rfr_n_trees)) # we statically use 50 trees for RFR
# build the classifier collections with separate classifiers for each cluster
for src_cube in self.refcubes: # type: RefCube
cls_collection = nested_dict()
......@@ -1359,8 +1375,9 @@ class ClusterClassifier_Generator(object):
for tgt_cube in self.refcubes:
if (src_cube.satellite, src_cube.sensor) == (tgt_cube.satellite, tgt_cube.sensor):
continue
print("Creating %s cluster classifier to predict %s %s from %s %s..."
% (method, tgt_cube.satellite, tgt_cube.sensor, src_cube.satellite, src_cube.sensor))
clf_str = 'classifier' if n_clusters == 1 else 'cluster classifier'
print("Creating %s %s to predict %s %s from %s %s..."
% (method, clf_str, tgt_cube.satellite, tgt_cube.sensor, src_cube.satellite, src_cube.sensor))
src_derived_LBAs = self._get_derived_LayerBandsAssignments(src_cube.satellite, src_cube.sensor)
tgt_derived_LBAs = self._get_derived_LayerBandsAssignments(tgt_cube.satellite, tgt_cube.sensor)
......@@ -1441,9 +1458,10 @@ def get_machine_learner(method='LR', **init_params):
# type: (str, dict) -> Union[LinearRegression, Ridge, Pipeline, RandomForestRegressor]
"""Get an instance of a machine learner.
:param method: 'LR': Linear regression
'RR': Ridge regression
'QR': Quadratic regression
:param method: 'LR': Linear Regression
'RR': Ridge Regression
'QR': Quadratic Regression
'RFR': Random Forest Regression (50 trees)
:param init_params: parameters to be passed to __init__() function of the returned machine learner model.
"""
if method == 'LR':
......@@ -1452,6 +1470,8 @@ def get_machine_learner(method='LR', **init_params):
return Ridge(**init_params)
elif method == 'QR':
return make_pipeline(PolynomialFeatures(degree=2), LinearRegression(**init_params))
elif method == 'RFR':
return RandomForestRegressor(**init_params)
else:
raise ValueError("Unknown machine learner method code '%s'." % method)
......@@ -1464,6 +1484,8 @@ def get_machine_learner(method='LR', **init_params):
def get_filename_classifier_collection(method, src_satellite, src_sensor, n_clusters=1, n_trees=None):
    # type: (str, str, str, int, int) -> str
    """Return the file name of the classifier collection for the given method and source sensor.

    The name has the form '<method tag>_clust<n_clusters>__<src_satellite>__<src_sensor>.dill', where the
    method tag is the method code, extended by '_alpha1.0' for 'RR' and by '_trees<n_trees>' for 'RFR'.

    :param method:          machine learner code, e.g. 'LR', 'RR', 'QR' or 'RFR'
    :param src_satellite:   source satellite name, e.g. 'Landsat-8'
    :param src_sensor:      source sensor name, e.g. 'OLI_TIRS'
    :param n_clusters:      number of spectral clusters to be encoded in the file name
    :param n_trees:         number of trees to be encoded in the file name if method is 'RFR'
                            (default: CFG.spechomo_rfr_n_trees); ignored for all other methods
    :return:                file name (without directory) ending with '.dill'
    """
    if method == 'RR':
        method_tag = method + '_alpha1.0'  # TODO add to config
    elif method == 'RFR':
        # Only fall back to the job configuration if the caller did not state the tree count explicitly,
        # so that a forest trained with a non-default 'n_estimators' can get a matching file name.
        if n_trees is None:
            n_trees = CFG.spechomo_rfr_n_trees
        method_tag = '%s_trees%s' % (method, n_trees)
    else:
        method_tag = method

    return '__'.join(['%s_clust%s' % (method_tag, n_clusters), src_satellite, src_sensor]) + '.dill'
......@@ -1493,9 +1515,10 @@ class RSImage_ClusterPredictor(object):
"""Get an instance of RSImage_ClusterPredictor.
:param method: machine learning approach to be used for spectral bands prediction
'LR': Linear Regression
'RR': Ridge Regression
'QR': Quadratic Regression
'LR': Linear Regression
'RR': Ridge Regression
'QR': Quadratic Regression
'RFR': Random Forest Regression (50 trees; does not allow spectral sub-clustering)
:param n_clusters: Number of spectral clusters to be used during LR/ RR/ QR homogenization.
E.g., 50 means that the image to be converted to the spectral target sensor
is clustered into 50 spectral clusters and one separate machine learner per
......@@ -1520,6 +1543,12 @@ class RSImage_ClusterPredictor(object):
self.CPUs = CPUs
self.classif_alg = classif_alg
# validate
if method == 'RFR' and n_clusters > 1:
warnings.warn("The spectral homogenization method 'Random Forest Regression' does not allow spectral sub-"
"clustering. Setting 'n_clusters' to 1.")
self.n_clusters = 1
def get_classifier(self, src_satellite, src_sensor, src_LBA, tgt_satellite, tgt_sensor, tgt_LBA):
# type: (str, str, list, str, str, list) -> Cluster_Learner
"""Select the correct machine learning classifier out of previously saved classifier collections.
......
......@@ -330,6 +330,7 @@ class JobConfig(object):
self.exec_L2BP = gp('exec_L2BP')
self.spechomo_method = gp('spechomo_method')
self.spechomo_n_clusters = gp('spechomo_n_clusters')
self.spechomo_rfr_n_trees = 50 # this is a static config value, not a user option
self.spechomo_classif_alg = gp('spechomo_classif_alg')
self.spechomo_kNN_n_neighbors = gp('spechomo_kNN_n_neighbors')
self.spechomo_estimate_accuracy = gp('spechomo_estimate_accuracy')
......
......@@ -153,7 +153,9 @@
LI: Linear interpolation;
LR: Linear regression;
RR: Ridge regression;
QR: Quadratic regression*/
QR: Quadratic regression;
RFR: Random forest regression with 50 trees
(no spectral sub-clustering available)*/
"spechomo_n_clusters": 50, /*Number of spectral clusters to be used during LR/ RR/ QR homogenization.
E.g., 50 means that the image to be converted to the spectral target sensor
is clustered into 50 spectral clusters and one separate machine learner per
......
......@@ -124,7 +124,7 @@ gms_schema_input = dict(
run_processor=dict(type='boolean', required=False),
write_output=dict(type='boolean', required=False),
delete_output=dict(type='boolean', required=False),
spechomo_method=dict(type='string', required=False, allowed=['LI', 'LR', 'RR', 'QR']),
spechomo_method=dict(type='string', required=False, allowed=['LI', 'LR', 'RR', 'QR', 'RFR']),
spechomo_n_clusters=dict(type='integer', required=False, allowed=[1, 5, 10, 15, 20, 30, 40, 50]),
spechomo_classif_alg=dict(type='string', required=False, allowed=['MinDist', 'kNN', 'SAM']),
spechomo_kNN_n_neighbors=dict(type='integer', required=False, min=0),
......
......@@ -108,7 +108,7 @@ class Test_ClusterClassifier_Generator(unittest.TestCase):
@classmethod
def setUpClass(cls):
set_config(job_ID=26186196, db_host=db_host, reset_status=True, is_test=True)
cls.config = set_config(job_ID=26186196, db_host=db_host, reset_status=True, is_test=True)
cls.tmpOutdir = tempfile.TemporaryDirectory()
@classmethod
......@@ -167,6 +167,21 @@ class Test_ClusterClassifier_Generator(unittest.TestCase):
self.assertIsInstance(undilled, dict)
self.assertTrue(bool(undilled), msg='Generated classifier collection is empty.')
def test_create_classifiers_RFR(self):
    """Check that an RFR classifier collection is written to disk and loads as a non-empty dict."""
    out_dir = self.tmpOutdir.name
    generator = ClusterClassifier_Generator([refcube_l8, refcube_l5])
    generator.create_classifiers(outDir=out_dir, method='RFR', n_clusters=1, n_jobs=-1, n_estimators=20)

    # the expected file name uses the tree count read from self.config
    expected_fname = 'RFR_trees%d_clust1__Landsat-8__OLI_TIRS.dill' % self.config.spechomo_rfr_n_trees
    path_collection = os.path.join(out_dir, expected_fname)
    self.assertTrue(os.path.exists(path_collection))

    with open(path_collection, 'rb') as fobj:
        collection = dill.load(fobj)

    self.assertIsInstance(collection, dict)
    self.assertTrue(bool(collection), msg='Generated classifier collection is empty.')
class Test_SpectralHomogenizer(unittest.TestCase):
"""Tests class for gms_preprocessing.algorithms.L2B_P.Test_SpectralHomogenizer"""
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment