Commit 129faf06 authored by Michael Rudolf

Project update

- Processing stage is now saved in a config file (*.ini)
- Minor updates to functions
parent 63cdf45e
# ignore the cached variables from running the script
Tests/
.ipynb_checkpoints/
ExampleData_*/
.vscode/
# ignore pictures, pdfs, and data files
*.pdf
@@ -6,7 +6,7 @@ feature_functions.py
Module containing all feature functions to be generated.
__AUTHOR__: Jon Bedford
__AUTHOR__: Jon Bedford, Michael Rudolf
__DATE__: 26-Feb-2019
'''
@@ -42,12 +42,12 @@ def do_mean(x):
return np.mean(x)
def do_median(x):
return np.median(x)
# def do_median(x): # Strongly correlates
# return np.median(x)
def do_maximum(x):
return np.max(x)
# def do_maximum(x): # Strongly correlates with mean
# return np.max(x)
def do_minimum(x):
@@ -121,36 +121,36 @@ def do_count_below_mean(x):
def do_first_location_of_maximum(x):
return [np.argmax(x) / len(x) if len(x) > 0 else np.NaN]
return np.argmax(x) / len(x) if len(x) > 0 else np.NaN
def do_first_location_of_minimum(x):
return [ts.first_location_of_minimum(x)]
return ts.first_location_of_minimum(x)
def do_last_location_of_maximum(x):
return [1.0 - np.argmax(x[::-1]) / len(x) if len(x) > 0 else np.NaN]
return 1.0 - np.argmax(x[::-1]) / len(x) if len(x) > 0 else np.NaN
def do_last_location_of_minimum(x):
return [1.0 - np.argmin(x[::-1]) / len(x) if len(x) > 0 else np.NaN]
return 1.0 - np.argmin(x[::-1]) / len(x) if len(x) > 0 else np.NaN
def do_longest_strike_above_mean(x):
return [ts.longest_strike_above_mean(x)]
return ts.longest_strike_above_mean(x)
def do_longest_strike_below_mean(x):
return [ts.longest_strike_below_mean(x)]
return ts.longest_strike_below_mean(x)
def do_mean_change(x):
return [ts.mean_change(x)]
return ts.mean_change(x)
def do_mean_abs_change(x):
return [np.mean(np.absolute(np.diff(x)))]
# def do_mean_abs_change(x): # Strongly correlates with mean change
# return np.mean(np.absolute(np.diff(x)))
def do_mean_second_derivative_central(x):
return [ts.mean_second_derivative_central(x)]
return ts.mean_second_derivative_central(x)
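# Note: the do_* functions now return plain scalars instead of one-element
# lists, e.g. do_mean_change(np.array([1., 2., 4.])) evaluates to 1.5, so
# results can be assigned directly into a preallocated feature array.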
@@ -11,19 +11,24 @@ __DATE__: 26-Feb-2019
'''
# Python Modules
import configparser
import inspect
import h5py
import numpy as np
import os
import logging
import shutil
from tqdm import tqdm
import filters
import feature_functions as ffc
# Custom Modules
import modules.filters as filters
import modules.feature_functions as ffc
import modules.projects as projects
import modules.utils as utils
def run(file_path, **kwargs):
def run(file_path, prj_file):
'''
Runs a full feature extraction for all h5 files existing in file_path.
If a config file with the same name as the folder exists in the path, it
@@ -32,99 +37,96 @@ def run(file_path, **kwargs):
that you want to change.
'''
# Read Parameters from config, if not existing create default.
name_cfg = file_path.split('/')[-2]+'.ini'
cfg_file = config(file_path=file_path+name_cfg, **kwargs)
# Parameters
window = cfg_file['params'].getint('window')
step_frac = cfg_file['params'].getfloat('step_frac')
lowpass_cutoff = cfg_file['params'].getfloat('lowpass_cutoff')
lowpass_order = cfg_file['params'].getfloat('lowpass_order')
win_data = []
label_data = []
window = prj_file['params'].getint('window')
step_frac = prj_file['params'].getfloat('step_frac')
lowpass_cutoff = prj_file['params'].getfloat('lowpass_cutoff')
lowpass_order = prj_file['params'].getfloat('lowpass_order')
if not file_path.endswith('/'):
file_path += '/'
win_data_shear = []
label_data_shear = []
win_data_lid = []
label_data_lid = []
# Iterate over files and create windowed data
file_list = [f for f in os.listdir(file_path) if f.endswith('.h5')]
for (i, file) in enumerate(file_list):
for file in tqdm(file_list, desc='Creating Windows'):
with h5py.File(file_path+file, 'r') as hf:
Fs = hf['Fs'][()]
# Extract windows
shear = hf['shear']
eqs = _hdf5group_to_dict(hf['eqs'])
(windows, labels) = extract_windows(shear, window, step_frac, eqs)
win_data.extend(windows)
label_data.extend(labels)
logging.info('%6i windows after %2i files.' % (len(win_data), i+1))
lid = hf['lid_disp']
eqs = utils.hdf5group_to_dict(hf['eqs'])
(windows_shear, labels_shear) = extract_windows(shear,
window,
step_frac,
eqs)
win_data_shear.extend(windows_shear)
label_data_shear.extend(labels_shear)
(windows_lid, labels_lid) = extract_windows(lid,
window,
step_frac,
eqs)
win_data_lid.extend(windows_lid)
label_data_lid.extend(labels_lid)
# Filter the data
win_filt = [filters.filter_data(window,
lowpass_cutoff,
Fs)
for window in win_data]
win_filt_shear = [filters.filter_data(window,
lowpass_cutoff,
Fs)
for window in tqdm(win_data_shear,
desc='Filtering Shear Data')]
win_filt_lid = [filters.filter_data(window,
lowpass_cutoff,
Fs)
for window in tqdm(win_data_lid,
desc='Filtering Lid Data')]
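# NOTE: the filtered windows (win_filt_shear, win_filt_lid) are not used
# below, and lowpass_order is read but never passed to filter_data; the
# feature functions operate on the raw windows.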
# Look into feature_functions to get a list of functions to process with
f_list = inspect.getmembers(ffc, inspect.isfunction)
feature_names = [entry[0].replace('do_', '') for entry in f_list]
features = [create_features(window, f_list) for window in win_data]
features_shear = [create_features(window, f_list) for window in
tqdm(win_data_shear, desc='Features Shear Data')]
features_lid = [create_features(window, f_list) for window in
tqdm(win_data_lid, desc='Features Lid Data')]
features = np.concatenate(
(np.array(features_shear), np.array(features_lid)),
axis=1
)
feature_names_shear = [f+'_shear' for f in feature_names]
feature_names_lid = [f+'_lid' for f in feature_names]
feature_names_shear = np.array(feature_names_shear).astype('S50')
feature_names_lid = np.array(feature_names_lid).astype('S50')
feature_names = np.concatenate((feature_names_shear, feature_names_lid))
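# Example: a feature function do_mean contributes the columns 'mean_shear'
# and 'mean_lid' after the suffixing and concatenation above; label_data
# below keeps only the shear labels, which appear to be identical to the
# lid labels by construction.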
label_data = np.array(label_data_shear)
# Collect examples in a dictionary
feature_dict = {
'features': features,
'labels': label_data,
'feature_names': feature_names
}
return feature_dict
def config(**kwargs):
'''
Tries to read a config from the given file path. If called without any
arguments a new config is created in the current path using standard
entries. You can change default values and add new values to the config by
passing keyword arguments.
Examples:
config()
- Creates a default config with the name 'default_parameters.ini'
config(file_path='foo.ini')
- Tries to open the config 'foo.ini' if it does not exist, it is
created.
config(window=45)
- Creates a default config with the entry 'window' set to 45.
'''
cfg = configparser.ConfigParser()
if 'file_path' in kwargs:
try:
with open(kwargs['file_path']) as f:
cfg.read_file(f)
except Exception as e:
create_default_config(kwargs['file_path'])
with open(kwargs['file_path']) as f:
cfg.read_file(f)
else:
cfg_path = create_default_config()
with open(cfg_path) as f:
cfg.read_file(f)
return cfg
def create_default_config(file_path='default_parameters.ini'):
''' Contains the default entries for the config '''
cfg = configparser.ConfigParser()
cfg['params'] = {
'window': 30,
'step_frac': 1,
'min_cycles': 5,
'min_win': 10,
'lowpass_cutoff': 60,
'lowpass_order': 5
}
with open(file_path, 'w') as cfg_file:
cfg.write(cfg_file)
return file_path
# Change and save project state and created features
subset_folder = prj_file['state']['subset_folder']
feature_folder = subset_folder.replace('subsets', 'features')
try:
os.mkdir(feature_folder)
except FileExistsError:
shutil.rmtree(feature_folder, ignore_errors=True)
os.mkdir(feature_folder)
feature_name = prj_file['params']['file_path'].split('/')[-1]
feature_name = feature_name.replace('.prj', '.h5')
feature_file = feature_folder+feature_name
with h5py.File(feature_file, 'w') as hf:
utils.dict_to_hdf5(hf, feature_dict)
prj_file['state']['features_created'] = 'True'
prj_file['state']['feature_file'] = feature_file
projects.save_project(prj_file)
return feature_dict
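# A minimal usage sketch (paths are placeholders; assumes create_subsets has
# already populated prj['state']['subset_folder'] with the .h5 subset files):
#
#   prj = projects.project(file_path='Exp1.prj')
#   feature_dict = run(prj['state']['subset_folder'], prj)
#   print(feature_dict['feature_names'][:3])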
def extract_windows(data, window, step_frac, eqs):
@@ -137,7 +139,8 @@ def extract_windows(data, window, step_frac, eqs):
# We do a list comprehension to create small windows of data
step = round(window*step_frac)
num_its = int(np.floor(len(data)/(window*step_frac)))
win_data = [data[i:i+step] for i in range(0, num_its*window, step)]
win_data = [data[i:i+step] for i in
range(0, num_its*window, step)]
# The labels are generated from an array of the same length as the data.
labels = np.zeros_like(data)
@@ -158,9 +161,8 @@ def extract_windows(data, window, step_frac, eqs):
pass
# Then we add up all labels in a window, to be able to combine events
label_data = [int(np.sum(labels[i:i+step])) for i in range(0,
num_its*window,
step)]
label_data = [int(np.sum(labels[i:i+step])) for i in
range(0, num_its*window, step)]
return (win_data, label_data)
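# Worked example: with window=30 and step_frac=1 the step is 30 samples, so
# a 300-sample trace yields 10 contiguous windows and 10 summed labels. Note
# that each slice is `step` (not `window`) samples long, so window length
# and step only coincide for step_frac=1.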
@@ -169,16 +171,5 @@ def create_features(window, f_list):
''' Uses the functions given in f_list to calculate features '''
features = np.zeros(len(f_list))
for (i, fnc) in enumerate(f_list):
print(fnc)
features[i] = fnc[1](window)
return features
def _hdf5group_to_dict(h5group):
'''
Returns a dictionary with each element in the h5group.
'''
out_dict = dict()
for dset in h5group.keys():
out_dict[dset] = h5group[dset][()]
return out_dict
@@ -12,28 +12,26 @@ __DATE__: 13-Feb-2019
'''
# Imports
# Python Modules
import numpy as np
import h5py
import logging
from ete3 import Tree
import os
import shutil
import logging
import matplotlib.pyplot as plt
# Custom Modules
import modules.utils as utils
import modules.projects as projects
# Public Functions
def show_h5_contents(cur_file):
''' Displays the content of a h5file in a structured format '''
with h5py.File(cur_file, 'r') as exp:
tree_string = _extract_keys_asTree(exp)
tree = Tree(tree_string+';', format=1)
print(tree.get_ascii(show_internal=True))
def create_sets(file_path):
def create_sets(file_path, prj_file=None):
''' Separates the input file into individual sets of the same velocity '''
if not prj_file:
prj_name = file_path.replace('.h5', '.prj')  # match the '.prj' default used by modules.projects
prj_file = projects.project(file_path=prj_name)
# Read some essential info from file
with h5py.File(file_path, 'r') as exp:
vel_steps = exp['vel_info']['vel_steps'][()]
@@ -59,15 +57,25 @@ def create_sets(file_path):
slice_set = slice(vel_int[i], vel_int[i+1])
set_file_list.append(set_dir_name+'/'+fname_set)
# Slice the h5 file
_slice_hdf5(file_path, slice_set, set_dir_name, fname_set)
utils.slice_hdf5(file_path, slice_set, set_dir_name, fname_set)
# Change and save project state in project file
prj_file['state']['sets_created'] = 'True'
prj_file['state']['set_folder'] = set_dir_name+'/'
with open(prj_file['params']['file_path'], 'w') as cfg_file:
prj_file.write(cfg_file)
return set_file_list
def create_subsets(set_list_new, max_dur, min_cycles=5):
def create_subsets(set_list_new, prj_file, max_dur, min_cycles=5):
''' Creates equally long subsets from the list of set files given '''
# Create subset folder, or delete and recreate if it exists
(path, file) = os.path.split(set_list_new[0])
subset_dir_name = path.replace('_sets', '_subsets')
if path.endswith('_sets'):
subset_dir_name = path.replace('_sets', '_subsets/')
else:
subset_dir_name = path+'_subsets/'
try:
os.mkdir(subset_dir_name)
except FileExistsError:
@@ -87,10 +95,16 @@ def create_subsets(set_list_new, max_dur, min_cycles=5):
subset_file += a+'_'
subset_file = subset_file.replace('.h5_', '.h5')
# Slice the data
_slice_hdf5(set_file,
slice_set,
subset_dir_name,
subset_file)
utils.slice_hdf5(set_file,
slice_set,
subset_dir_name,
subset_file)
# Change and save project state in project file
prj_file['state']['subsets_created'] = 'True'
prj_file['state']['subset_folder'] = subset_dir_name
with open(prj_file['params']['file_path'], 'w') as cfg_file:
prj_file.write(cfg_file)
return subset_dir_name
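# Example: sets stored in 'Exp1_sets' end up in 'Exp1_subsets/', while a
# source folder without the '_sets' suffix simply gets '_subsets/' appended.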
@@ -160,8 +174,8 @@ def omit_sets(set_file_list,
# go through each set file and calculate parameters
for (i, set_file) in enumerate(set_file_list):
with h5py.File(set_file, 'r') as set_f:
eqd = set_f['eqd'][()]
eqf = set_f['eqf'][()]
eqd = set_f['eqs']['eqd'][()]
eqf = set_f['eqs']['eqf'][()]
sz_data = set_f['shear'].shape
eqd = np.array([x for x in eqd if not np.isnan(x)])
eqf = np.array([x for x in eqf if not np.isnan(x)])
@@ -179,13 +193,16 @@ def omit_sets(set_file_list,
if len(omit_samples) > 0:
logging.info('Sets to omit because of minimum cycle length: '
+ str(omit_samples))
logging.info('To avoid lower minimum number of windows, window size, or step size.')
logging.info('To avoid this, lower the minimum number of windows, '
'window size, or step size.')
if len(omit_length) > 0:
logging.info('Sets to omit because of minimum set length: '
+ str(omit_length))
logging.info('To avoid remove the set with the longest cycles from the data.')
logging.info('To avoid this, remove the set with the longest cycles '
'from the data.')
if len(omit_total) > 0:
logging.info('In total these sets have to be omitted: '+str(omit_total))
logging.info('In total these sets have to be omitted: '
+ str(omit_total))
return omit_total, max_dur
@@ -245,101 +262,3 @@ def visualize_subsets(subset_dir_name):
fig.tight_layout()
fig.show()
fig.savefig(subset_dir_name+'/OverviewOfSubsets')
# Internal Functions
def _extract_keys_asTree(f):
''' Generates a string for use with the ete3 TreeObject '''
tree_string = '('
for key in f.keys():
add = ''
if f[key].__class__ == h5py._hl.group.Group:
tree_string += _extract_keys_asTree(f[key])
else:
typ = str(f[key].dtype)
shp = str(f[key].shape)
if shp.endswith('()') or shp.endswith(',)'):
shp = shp[0:-1]+'0)'
shp = shp.replace(',', 'x')
add = '('+shp+'--->'+typ+')'
tree_string += add+'--'+key+','
tree_string = tree_string[0:-1]+')'
return tree_string
def _hdf5group_to_dict(h5group):
'''
Returns a dictionary with each element in the h5group.
'''
out_dict = dict()
for dset in h5group.keys():
out_dict[dset] = h5group[dset][()]
return out_dict
def _dict_to_hdf5(h5_file, dicty):
'''
Recursive function saving a dictionary into a hdf5 file. If data is present
as a numpy array, the data is automatically compressed and chunked using
h5py's standard settings.
'''
for (name, content) in dicty.items():
if content.__class__.__name__ == 'dict':
grp = h5_file.create_group(name)
_dict_to_hdf5(grp, content)
elif content.__class__.__name__ == 'ndarray':
h5_file.create_dataset(name, data=content, compression="gzip")
else:
h5_file.create_dataset(name, data=content)
return h5_file
def _slice_hdf5(file_path, slice_set, set_dir_name, fname_set):
'''
Slices the data in a h5 file and saves it in a new folder
Keywords:
file_path -- The path to the input h5 file
slice_set -- A slice() object which defines where to cut the data
set_dir_name -- The new folder where the sliced file has to go
fname_set -- The name of the output h5 file
'''
# Read out actual data from file
with h5py.File(file_path, 'r') as exp:
try:
normal = np.around(np.mean(exp['data']['normal']), decimals=-3)
len_data = len(exp['data']['shear'])
shear = exp['data']['shear'][slice_set]
lid_disp = exp['data']['lid_disp'][slice_set]
except KeyError:
normal = exp['normal'][()]
len_data = len(exp['shear'])
shear = exp['shear'][slice_set]
lid_disp = exp['lid_disp'][slice_set]
if slice_set.start < 0:
# Change slice for propagation of eqs (relative to absolute)
new_start = len_data+slice_set.start
new_end = len_data+slice_set.stop
slice_set = slice(new_start, new_end)
eqs = _hdf5group_to_dict(exp['eqs'])
Fs = exp['Fs'][()]
# Create a slice for the preprocessed event database (eqs)
eq = eqs['eqi']
start = np.searchsorted(eq, slice_set.start, 'left')
end = np.searchsorted(eq, slice_set.stop, 'right')
slice_eqs = slice(start, end)
# Create a new database, now referenced to the starting point of the set
eqs_new = dict()
for eqn in eqs.keys():
eqs_new[eqn] = eqs[eqn][slice_eqs]-slice_set.start
# Store in set file
with h5py.File(set_dir_name+'/'+fname_set, 'w') as setf:
setf.create_dataset('shear', data=shear, compression='gzip')
setf.create_dataset('lid_disp', data=lid_disp, compression='gzip')
setf.create_dataset('Fs', data=Fs)
setf.create_dataset('normal', data=normal)
_dict_to_hdf5(setf, {'eqs': eqs_new})
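# Example: for a 10000-sample record, slice(-2000, -1000) is first mapped to
# the absolute slice(8000, 9000) so that the event indices in 'eqs' can be
# re-referenced to the start of the new set file.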
#!/usr/bin/env python3
'''
projects.py
Project management by assigning an individual config to each project.
__AUTHOR__: Michael Rudolf
__DATE__: 25-Mar-2019
'''
import configparser
def project(**kwargs):
'''
Tries to read a project from the given file path. If called without any
arguments a new project is created in the current path using standard
entries. You can change default values and add new values to the project by
passing keyword arguments.
Examples:
project()
- Creates a default project with the name 'default_parameters.prj'
project(file_path='foo.ini')
- Tries to open the project 'foo.ini' if it does not exist, it is
created.
project(window=45)
- Creates a default project with the entry 'window' set to 45.
'''
prj = configparser.ConfigParser()
if 'file_path' in kwargs:
prj_path = kwargs['file_path']
try:
with open(prj_path) as f:
prj.read_file(f)
except Exception as e:
prj = create_default_project(file_path=prj_path)
else:
prj = create_default_project()
return update_project(prj, **kwargs)
def create_default_project(file_path='default_parameters.prj'):
''' Contains the default entries for the project '''
prj = configparser.ConfigParser()
prj['params'] = {
'directory': '',
'file_path': file_path,
'lowpass_cutoff': 150,
'lowpass_order': 5,
'min_cycles': 5,
'min_win': 10,
'step_frac': 1,
'window': 30,
}
prj['state'] = {
'feature_file': '',
'features_created': False,
'set_folder': '',
'sets_created': False,
'subset_folder': '',
'subsets_created': False,
}
return save_project(prj)
def update_project(prj, **kwargs):
for k in kwargs.keys():
prj['params'][k] = str(kwargs[k])  # configparser stores strings only
return save_project(prj)
def save_project(prj):
with open(prj['params']['file_path'], 'w') as prj_file:
prj.write(prj_file)
return read_project(prj)
def read_project(prj):
with open(prj['params']['file_path'], 'rt') as prj_file:
prj.read_file(prj_file)
return prj
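if __name__ == '__main__':
    # Usage sketch: 'demo.prj' is a placeholder name. project() falls back to
    # create_default_project() when the file does not exist, and every update
    # is persisted immediately via save_project().
    prj = project(file_path='demo.prj', window='45')
    print(prj['params']['window'])       # -> '45'
    print(prj['state']['sets_created'])  # -> 'False'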
#!/usr/bin/env python3
'''
utils.py
Contains helper functions for various purposes.
__AUTHOR__: Michael Rudolf
__DATE__: 25-Mar-2019
'''
# Python Modules
from ete3 import Tree
import h5py
import numpy as np
def show_h5_contents(cur_file):
''' Displays the content of a h5file in a structured format '''
with h5py.File(cur_file, 'r') as exp:
tree_string = extract_keys_asTree(exp)
tree = Tree(tree_string+';', format=1)
print(tree.get_ascii(show_internal=True))
def extract_keys_asTree(f):
''' Generates a string for use with the ete3 TreeObject '''
tree_string = '('
for key in f.keys():
add = ''
if f[key].__class__ == h5py._hl.group.Group:
tree_string += extract_keys_asTree(f[key])
else:
typ = str(f[key].dtype)
shp = str(f[key].shape)
if shp.endswith('()') or shp.endswith(',)'):
shp = shp[0:-1]+'0)'
shp = shp.replace(',', 'x')
add = '('+shp+'--->'+typ+')'
tree_string += add+'--'+key+','
tree_string = tree_string[0:-1]+')'
return tree_string
def hdf5group_to_dict(h5group):
'''
Returns a dictionary with each element in the h5group.
'''
out_dict = dict()
for dset in h5group.keys():
out_dict[dset] = h5group[dset][()]
return out_dict
def dict_to_hdf5(h5_file, dicty):
'''
Recursive function saving a dictionary into a hdf5 file. If data is present
as a numpy array, the data is automatically compressed and chunked using
h5py's standard settings.
'''
for (name, content) in dicty.items():
if content.__class__.__name__ == 'dict':
grp = h5_file.create_group(name)
dict_to_hdf5(grp, content)
elif content.__class__.__name__ == 'ndarray':
h5_file.create_dataset(name, data=content, compression="gzip")
else:
h5_file.create_dataset(name, data=content)
return h5_file
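if __name__ == '__main__':
    # Round-trip sketch: 'demo.h5' is a placeholder name. Nested dicts become
    # HDF5 groups, numpy arrays become gzip-compressed datasets, and
    # hdf5group_to_dict restores the group content as plain arrays.
    example = {'Fs': 2500, 'eqs': {'eqi': np.arange(5), 'eqd': np.ones(5)}}
    with h5py.File('demo.h5', 'w') as hf:
        dict_to_hdf5(hf, example)
    with h5py.File('demo.h5', 'r') as hf:
        eqs = hdf5group_to_dict(hf['eqs'])
    print(eqs['eqi'])  # -> [0 1 2 3 4]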