Commit da7f0395 authored by Daniel Scheffler
Browse files

Added random_state to KMeansRSImage.get_random_spectra_from_each_cluster()....

Added random_state to KMeansRSImage.get_random_spectra_from_each_cluster(). Added kwargs to classify_image() to allow passing init args to classifiers. Fixed CFG.spechomo_n_clusters, CFG.spechomo_classif_alg and CFG.spechomo_kNN_n_neighbors being ignored in spectral homogenization. Added test_predict_by_machine_learner__RFR_L8_S2().
parent 0bafa7a4
Pipeline #3751 failed with stage
in 18 minutes and 48 seconds
......@@ -132,6 +132,9 @@ class L2B_object(L2A_object):
tgt_satellite=tgt_sat,
tgt_sensor=tgt_sen,
tgt_LBA=tgt_LBA,
n_clusters=CFG.spechomo_n_clusters,
classif_alg=CFG.spechomo_classif_alg,
kNN_n_neighbors=CFG.spechomo_kNN_n_neighbors,
nodataVal=self.arr.nodata,
compute_errors=CFG.spechomo_estimate_accuracy,
bandwise_errors=CFG.spechomo_bandwise_accuracy,
......@@ -224,6 +227,7 @@ class SpectralHomogenizer(object):
'LR': Linear Regression
'RR': Ridge Regression
'QR': Quadratic Regression
'RFR': Random Forest Regression (50 trees; does not allow spectral sub-clustering)
:param src_satellite: source satellite, e.g., 'Landsat-8'
:param src_sensor: source sensor, e.g., 'OLI_TIRS'
:param src_LBA: source LayerBandsAssignment
......@@ -254,15 +258,15 @@ class SpectralHomogenizer(object):
:rtype: Tuple[np.ndarray, Union[np.ndarray, None]]
"""
# TODO: add LBA validation to .predict()
# if n_clusters > 1:
PR = RSImage_ClusterPredictor(method=method,
classifier_rootDir=self.classifier_rootDir,
n_clusters=n_clusters,
classif_alg=classif_alg,
kNN_n_neighbors=kNN_n_neighbors)
# else:
# PR = RSImage_Predictor(method=method,
# classifier_rootDir=self.classifier_rootDir)
kw = dict(method=method,
classifier_rootDir=self.classifier_rootDir,
n_clusters=n_clusters,
classif_alg=classif_alg)
if classif_alg == 'kNN':
kw['n_neighbors'] = kNN_n_neighbors
PR = RSImage_ClusterPredictor(**kw)
######################
# get the classifier #
......@@ -599,7 +603,7 @@ class KMeansRSImage(object):
for label in range(self.n_clusters):
cluster_subset = df[df.cluster_label == label].loc[:, 'B1':]
# get random sample while filling it with duplicates of the same sample when cluster has not enough spectra
random_samples[label] = np.array(cluster_subset.sample(samplesize, replace=True))
random_samples[label] = np.array(cluster_subset.sample(samplesize, replace=True, random_state=20))
return random_samples
......@@ -1527,9 +1531,9 @@ class ClassifierCollection(object):
class RSImage_ClusterPredictor(object):
"""Predictor class applying the predict() function of a machine learning classifier described by the given args."""
def __init__(self, method='LR', n_clusters=50, classif_alg='MinDist', kNN_n_neighbors=10, classifier_rootDir='',
CPUs=1, logger=None):
# type: (str, int, str, int, str, Union[None, int], logging.Logger) -> None
def __init__(self, method='LR', n_clusters=50, classif_alg='MinDist', classifier_rootDir='',
CPUs=1, logger=None, **kw_clf_init):
# type: (str, int, str, str, Union[None, int], logging.Logger, dict) -> None
"""Get an instance of RSImage_ClusterPredictor.
:param method: machine learning approach to be used for spectral bands prediction
......@@ -1549,20 +1553,19 @@ class RSImage_ClusterPredictor(object):
'kNN': k-nearest-neighbour
'SAM': spectral angle mapping
'SID': spectral information divergence
:param kNN_n_neighbors: The number of neighbors to be considered in case 'classif_alg' is set to
'kNN'. Otherwise, this parameter is ignored.
:param classifier_rootDir: root directory where machine learning classifiers are stored.
:param CPUs: number of CPUs to use
:param logger: instance of logging.Logger()
:param kw_clf_init: keyword arguments to be passed to classifier init functions if possible
"""
self.method = method
self.n_clusters = n_clusters
self.kNN_n_neighbors = kNN_n_neighbors
self.classifier_rootDir = os.path.abspath(classifier_rootDir)
self.classif_map = None
self.CPUs = CPUs
self.classif_alg = classif_alg
self.logger = logger or GMS_logger(__name__) # must be pickable
self.kw_clf_init = kw_clf_init
# validate
if method == 'RFR' and n_clusters > 1:
......@@ -1570,8 +1573,8 @@ class RSImage_ClusterPredictor(object):
"sub-clustering. Setting 'n_clusters' to 1.")
self.n_clusters = 1
if self.classif_alg == 'kNN' and self.n_clusters < self.kNN_n_neighbors:
self.kNN_n_neighbors = self.n_clusters
if self.classif_alg == 'kNN' and 'n_neighbors' in kw_clf_init and self.n_clusters < kw_clf_init['n_neighbors']:
self.kw_clf_init['n_neighbors'] = self.n_clusters
def __getstate__(self):
"""Defines how the attributes of RSImage_ClusterPredictor instances are pickled."""
......@@ -1649,20 +1652,22 @@ class RSImage_ClusterPredictor(object):
# ensure image.nodata is present (important for classify_image() -> overwrites cmap at nodata positions)
image.nodata = in_nodataVal if in_nodataVal is not None else image.nodata # might be auto-computed here
# assign each input pixel to a cluster (compute classfication with cluster centers as endmembers)
# assign each input pixel to a cluster (compute classification with cluster centers as endmembers)
if not self.classif_map:
if self.n_clusters > 1:
t0 = time.time()
self.classif_map = classify_image(image,
classifier.cluster_centers,
classifier.cluster_pixVals,
classif_alg=self.classif_alg,
kNN_n_neighbors=self.kNN_n_neighbors,
in_nodataVal=image.nodata,
cmap_nodataVal=cmap_nodataVal, # written into classif_map at nodata
CPUs=self.CPUs)
kw_clf = dict(classif_alg=self.classif_alg,
in_nodataVal=image.nodata,
cmap_nodataVal=cmap_nodataVal, # written into classif_map at nodata
CPUs=self.CPUs,
**self.kw_clf_init)
self.classif_map = classify_image(image, classifier.cluster_centers, classifier.cluster_pixVals,
**kw_clf)
self.logger.info('Total classification time: %s'
% time.strftime("%H:%M:%S", time.gmtime(time.time() - t0)))
else:
self.classif_map = np.full((image.rows, image.cols), classifier.cluster_pixVals[0], np.int8)
......
......@@ -94,8 +94,8 @@ class MinimumDistance_Classifier(_ImageClassifier):
NOTE: distance equation: D = sqrt(sum((Xvi - Xvj)²))
"""
def __init__(self, train_spectra, train_labels, CPUs=1):
# type: (np.ndarray, Union[np.ndarray, List[int]], Union[int, None]) -> None
def __init__(self, train_spectra, train_labels, CPUs=1, **kwargs):
# type: (np.ndarray, Union[np.ndarray, List[int]], Union[int, None], dict) -> None
if CPUs is None or CPUs > 1:
CPUs = 1 # The NearestCentroid seems to parallelize automatically. So using multiprocessing is slower.
......@@ -104,7 +104,7 @@ class MinimumDistance_Classifier(_ImageClassifier):
self.clf_name = 'minimum distance (nearest centroid)'
self.clf = NearestCentroid()
self.clf = NearestCentroid(**kwargs)
self.clf.fit(train_spectra, train_labels)
def _predict(self, tilepos):
......@@ -117,13 +117,13 @@ class MinimumDistance_Classifier(_ImageClassifier):
class kNN_Classifier(_ImageClassifier):
def __init__(self, train_spectra, train_labels, CPUs=1, n_neighbors=10):
# type: (np.ndarray, Union[np.ndarray, List[int]], Union[int, None], int) -> None
def __init__(self, train_spectra, train_labels, CPUs=1, **kwargs):
# type: (np.ndarray, Union[np.ndarray, List[int]], Union[int, None], dict) -> None
super(kNN_Classifier, self).__init__(train_spectra, train_labels, CPUs=CPUs)
self.clf_name = 'k-nearest neighbour (kNN)'
self.clf = KNeighborsClassifier(n_neighbors=n_neighbors, n_jobs=1)
self.clf = KNeighborsClassifier(n_jobs=1, **kwargs)
self.clf.fit(train_spectra, train_labels)
def _predict(self, tilepos):
......@@ -241,8 +241,8 @@ class SID_Classifier(_ImageClassifier):
class RF_Classifier(_ImageClassifier):
"""Random forest classifier."""
def __init__(self, train_spectra, train_labels, CPUs=1, n_estimators=100, max_depth=2, random_state=0, **kw):
# type: (np.ndarray, Union[np.ndarray, List[int]], Union[int, None], int, int, int, dict) -> None
def __init__(self, train_spectra, train_labels, CPUs=1, **kwargs):
# type: (np.ndarray, Union[np.ndarray, List[int]], Union[int, None], dict) -> None
# if CPUs is None or CPUs > 1:
# CPUs = 1 # The NearestCentroid seems to parallelize automatically. So using multiprocessing is slower.
......@@ -250,9 +250,7 @@ class RF_Classifier(_ImageClassifier):
super(RF_Classifier, self).__init__(train_spectra, train_labels, CPUs=CPUs)
self.clf_name = 'random forest'
self.clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state,
n_jobs=1, **kw)
self.clf = RandomForestClassifier(n_jobs=1, **kwargs)
self.clf.fit(train_spectra, train_labels)
def _predict(self, tilepos):
......@@ -264,8 +262,8 @@ class RF_Classifier(_ImageClassifier):
return tilepos, self.clf.predict(spectra).reshape(*tileimdata.shape[:2])
def classify_image(image, train_spectra, train_labels, classif_alg,
kNN_n_neighbors=10, in_nodataVal=None, cmap_nodataVal=None, tiledims=(1000, 1000), CPUs=None):
def classify_image(image, train_spectra, train_labels, classif_alg, in_nodataVal=None, cmap_nodataVal=None,
tiledims=(1000, 1000), CPUs=None, **kwargs):
# type: (Union[np.ndarray, GeoArray], np.ndarray, Union[np.ndarray, List[int]], str, int, ...) -> GeoArray
"""Classify image to find the cluster each spectrum belongs to.
......@@ -279,24 +277,25 @@ def classify_image(image, train_spectra, train_labels, classif_alg,
'SAM': spectral angle mapping
'SID': spectral information divergence
'RF': random forest
:param kNN_n_neighbors: The number of neighbors to be considered in case 'classif_alg' is set to
'kNN'. Otherwise, this parameter is ignored.
:param in_nodataVal:
:param cmap_nodataVal:
:param tiledims:
:param CPUs: number of CPUs to be used for classification
:param kwargs: keyword arguments to be passed to classifiers if possible
"""
if classif_alg == 'kNN':
clf = kNN_Classifier(
train_spectra,
train_labels,
CPUs=CPUs,
n_neighbors=kNN_n_neighbors)
**kwargs)
elif classif_alg == 'MinDist':
clf = MinimumDistance_Classifier(
train_spectra,
train_labels,
CPUs=CPUs)
CPUs=CPUs,
**kwargs)
elif classif_alg == 'SAM':
clf = SAM_Classifier(
......@@ -312,7 +311,7 @@ def classify_image(image, train_spectra, train_labels, classif_alg,
clf = RF_Classifier(
train_spectra,
train_labels,
CPUs=CPUs)
CPUs=CPUs, **kwargs)
else:
raise NotImplementedError("Currently only the methods 'kNN', 'MinDist', 'SAM', 'SID' and 'RF' are implemented.")
......
......@@ -274,7 +274,7 @@ class Test_SpectralHomogenizer(unittest.TestCase):
self.assertEqual(errors.dtype, np.int16)
def test_predict_by_machine_learner__QR_cluster_L8_S2(self):
"""Test quadratic regression in spectral clusters from Landsat-8 to Sentinel-2A."""
"""Test quadratic regression including spectral clusters from Landsat-8 to Sentinel-2A."""
predarr, errors = self.SpH.predict_by_machine_learner(
self.testArr_L8,
method='QR', n_clusters=50,
......@@ -295,3 +295,26 @@ class Test_SpectralHomogenizer(unittest.TestCase):
self.assertIsInstance(errors, np.ndarray)
self.assertEqual(errors.shape, (50, 50, 13))
self.assertEqual(errors.dtype, np.int16)
def test_predict_by_machine_learner__RFR_L8_S2(self):
    """Test random forest regression from Landsat-8 to Sentinel-2A.

    Runs the prediction with a single cluster and error computation enabled,
    then checks type, shape and dtype of both the predicted array and the
    returned error array.
    """
    landsat8_bands = ['1', '2', '3', '4', '5', '6', '7']
    sentinel2_bands = ['1', '2', '3', '4', '5', '6', '7', '8', '8A', '9', '10', '11', '12']

    prediction_kwargs = dict(
        method='RFR',
        n_clusters=1,
        classif_alg='MinDist',
        src_satellite='Landsat-8',
        src_sensor='OLI_TIRS',
        src_LBA=landsat8_bands,
        tgt_satellite='Sentinel-2A',
        tgt_sensor='MSI',
        tgt_LBA=sentinel2_bands,
        compute_errors=True,
        nodataVal=-9999,
    )
    predarr, errors = self.SpH.predict_by_machine_learner(self.testArr_L8, **prediction_kwargs)

    # Both outputs must be 50 x 50 pixels with 13 target bands stored as int16;
    # the prediction is checked first, then the errors.
    for result, expected_type in ((predarr, GeoArray), (errors, np.ndarray)):
        self.assertIsInstance(result, expected_type)
        self.assertEqual(result.shape, (50, 50, 13))
        self.assertEqual(result.dtype, np.int16)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment