Commit 19fcd979 authored by Daniel Scheffler
Browse files

Bugfixes and speedup for MinimumDistance_Classifier.


Signed-off-by: Daniel Scheffler <danschef@gfz-potsdam.de>
parent a893af35
Pipeline #3887 failed with stage
in 1 minute and 57 seconds
......@@ -10,7 +10,6 @@ from tqdm import tqdm
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MaxAbsScaler
from numba import jit
from matplotlib import pyplot as plt
from geoarray import GeoArray
from py_tools_ds.numeric.array import get_array_tilebounds
......@@ -44,11 +43,12 @@ class _ImageClassifier(object):
self.cmap = None
self.clf_name = ''
self._distance_metrics = None
self._cmap_nodataVal = None
def _predict(self, tilepos):
raise NotImplementedError('This method has to be implemented by the subclass.')
def classify(self, image_cube, in_nodataVal=None, cmap_nodataVal=None, tiledims=(250, 250)):
def classify(self, image_cube, in_nodataVal=None, cmap_nodataVal=None, tiledims=(100, 100)):
"""
:param image_cube:
......@@ -57,6 +57,8 @@ class _ImageClassifier(object):
:param tiledims:
:return:
"""
self._cmap_nodataVal = cmap_nodataVal
dtype_cmap = np.array(self.train_labels).dtype
if cmap_nodataVal is not None and not np.can_cast(cmap_nodataVal, dtype_cmap):
dtype_cmap = np.find_common_type(np.array(self.train_labels), np.array([cmap_nodataVal]))
......@@ -74,7 +76,7 @@ class _ImageClassifier(object):
print('Performing %s image classification...' % self.clf_name)
if self.CPUs is None or self.CPUs > 1:
with Pool(self.CPUs, initializer=initializer, initargs=(self.train_spectra, image_cube_gA)) as pool:
tiles_results = pool.map(self._predict, bounds_alltiles)
tiles_results = list(pool.imap_unordered(self._predict, bounds_alltiles))
else:
initializer(self.train_spectra, image_cube_gA)
......@@ -139,14 +141,13 @@ class _ImageClassifier(object):
class MinimumDistance_Classifier(_ImageClassifier):
"""Classifier computing the n-dimensional euclidian distance of each pixel vector to each cluster mean vector.
NOTE: distance equation: D² = sqrt(sum((Xvi - Xvj)²)
NOTE: - distance equation: D² = sqrt(sum((Xvi - Xvj)²)
NOTE: - NearestCentroid parallelizes automatically but as long as the tile size is below 100 x 100,
Python multiprocessing is faster
"""
def __init__(self, train_spectra, train_labels, CPUs=1, **kwargs):
# type: (np.ndarray, Union[np.ndarray, List[int]], Union[int, None], dict) -> None
if CPUs is None or CPUs > 1:
CPUs = 1 # The NearestCentroid seems to parallelize automatically. So using multiprocessing is slower.
super(MinimumDistance_Classifier, self).__init__(train_spectra, train_labels, CPUs=CPUs)
self.clf_name = 'minimum distance (nearest centroid)'
......@@ -159,14 +160,14 @@ class MinimumDistance_Classifier(_ImageClassifier):
def euclidian_distance(self):
return self._distance_metrics
@jit
def compute_euclidian_distance_jit(self, imdata, cmap):
# spectra = im2spectra(imdata)
spectra = imdata.reshape((imdata.shape[0] * imdata.shape[1], imdata.shape[2]))
def compute_euclidian_distance(self, imdata, cmap, nodataVal_cmap):
spectra = im2spectra(imdata)
distances = np.empty(np.dot(*imdata.shape[:2]), np.float32)
labels = cmap.flatten()
for lbl in np.unique(cmap):
if nodataVal_cmap is not None and lbl == nodataVal_cmap:
continue
mask = labels == lbl
centroid = self.class_centroids[list(self.train_labels).index(lbl), :].reshape(1, -1).astype(np.float)
diff = spectra[mask, :] - centroid
......@@ -181,10 +182,9 @@ class MinimumDistance_Classifier(_ImageClassifier):
spectra = tileimdata.reshape((tileimdata.shape[0] * tileimdata.shape[1], tileimdata.shape[2]))
cmap = self.clf.predict(spectra).reshape(*tileimdata.shape[:2])
# dist = self.compute_euclidian_distance_jit(tileimdata.astype(np.float32), cmap)
dist = self.compute_euclidian_distance(tileimdata.astype(np.float32), cmap, self._cmap_nodataVal)
return tilepos, cmap
# return tilepos, cmap, dist
return tilepos, cmap, dist
def label_unclassified_pixels(self, label_unclassified, threshold):
# type: (int, Union[str, int, float]) -> GeoArray
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment