Commit b9d2b8fa authored by Michael Rudolf

-reorganization and minor changes

parent 23397565
This directory is only a placeholder for example data. Because the files are usually too large for this repository, the data should be imported manually.
\ No newline at end of file
- Feature creation
- scikit-learn
- Slice Data:
  - Each Velocity
  - Each Normal Load
- Function that calculates the amount of RAM needed, depending on:
  - WindowSize=30, StepSizeAsFraction=1, MinimumLengthOfCyclesCovered=3, MinimumWindowsPerCycle=10, MinFeatures=10
- RAM-Proxy (a rough sketch follows this list)
- Iteratively remove experiments
- Output the Number of Windows and which Experiments would be needed
- Add some kind of config file which defines the window sizes, step fractions, etc.
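A minimal sketch of such a RAM proxy, assuming the parameters listed above and 8 bytes per stored value; the function name, the byte estimate, and the example sample counts are illustrative assumptions, not part of the existing code:

def estimate_ram_proxy(n_samples, window=30, step_frac=1, min_features=10,
                       bytes_per_value=8):
    '''Rough proxy for the memory needed by the windowed data and the feature
    matrix. n_samples is the total number of samples across the kept experiments.'''
    step = max(1, round(window * step_frac))
    n_windows = n_samples // step
    window_bytes = n_windows * window * bytes_per_value           # raw windowed data
    feature_bytes = n_windows * min_features * bytes_per_value    # feature matrix
    return n_windows, (window_bytes + feature_bytes) / 1e9        # (count, GB)

# Iteratively drop the largest experiment until the proxy fits, e.g. below 16 GB
# (hypothetical sample counts):
lengths = {'exp_a': 3_000_000, 'exp_b': 5_000_000}
while estimate_ram_proxy(sum(lengths.values()))[1] > 16 and len(lengths) > 1:
    lengths.pop(max(lengths, key=lengths.get))
print(len(lengths), 'experiments kept,',
      estimate_ram_proxy(sum(lengths.values()))[0], 'windows')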
......@@ -38,9 +38,10 @@
"file_paths = {\n",
" 'home_office': 'C:/Users/Michael/ownCloud/DocStelle/GitRepos/shear-madness/0-data-preparation/ExampleData/b_5kPa_371-01-27-GB300.h5',\n",
" 'lab': 'C:/Users/M.Rudolf/ownCloud/DocStelle/GitRepos/shear-madness/0-data-preparation/ExampleData/b_5kPa_371-01-27-GB300.h5',\n",
" 'office': '~/home/mrudolf/ownCloud/DocStelle/GitRepos/shear-madness/0-data-preparation/ExampleData/b_5kPa_371-01-27-GB300.h5'\n",
" 'office': '/home/mrudolf/ownCloud/DocStelle/GitRepos/shear-madness/0-data-preparation/ExampleData/b_5kPa_371-01-27-GB300.h5',\n",
" 'office2': '/home/mrudolf/Documents/py_allSets_ML/'\n",
"}\n",
"file_path = file_paths['lab']\n",
"file_path = file_paths['office2']\n",
"preparation.show_h5_contents(file_path)\n"
]
},
......@@ -119,7 +120,7 @@
" setf.create_dataset('lid_disp', data=lid_disp, compression='gzip')\n",
" setf.create_dataset('Fs', data=Fs)\n",
" setf.create_dataset('normal', data=normal)\n",
" preparation._dict_to_hdf5(setf, eqs_new)"
" preparation._dict_to_hdf5(setf, {'eqs': eqs_new})"
]
},
{
......@@ -178,8 +179,8 @@
"# go through each set file and calculate parameters\n",
"for (i,set_file) in enumerate(set_file_list):\n",
" with h5py.File(set_file, 'r') as set_f:\n",
" eqd = set_f['eqd'][()]\n",
" eqf = set_f['eqf'][()]\n",
" eqd = set_f['eqs']['eqd'][()]\n",
" eqf = set_f['eqs']['eqf'][()]\n",
" sz_data = set_f['shear'].shape\n",
" eqd = np.array([x for x in eqd if not np.isnan(x)])\n",
" eqf = np.array([x for x in eqf if not np.isnan(x)])\n",
......@@ -229,6 +230,7 @@
"metadata": {},
"outputs": [],
"source": [
"importlib.reload(preparation)\n",
"# Filter out the omitted datasets\n",
"set_list_new = [x for (i,x) in enumerate(set_file_list) if i not in omit_small_total]\n",
"\n",
......@@ -244,7 +246,7 @@
"\n",
"for set_file in set_list_new:\n",
" # Calculate number of samples and slice\n",
" min_length = min_cycles*np.max(max_dur) \n",
" min_length = min_cycles*np.max(max_dur)\n",
" slice_set = slice(-int(min_length), -1)\n",
" # Create new file name from old set name\n",
" (path, file) = os.path.split(set_file)\n",
......@@ -297,9 +299,7 @@
"metadata": {},
"outputs": [],
"source": [
"with h5py.File(subset_dir_name+'/'+subset_file, 'r') as subset:\n",
" subset['lid_disp'][()]\n",
" "
"preparation.show_h5_contents(subset_dir_name+'/'+subset_file_list[0])"
]
}
],
......@@ -319,7 +319,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
"version": "3.6.7"
}
},
"nbformat": 4,
......
......@@ -309,19 +309,21 @@ def _slice_hdf5(file_path, slice_set, set_dir_name, fname_set):
with h5py.File(file_path, 'r') as exp:
try:
normal = np.around(np.mean(exp['data']['normal']), decimals=-3)
len_data = len(exp['data']['shear'])
shear = exp['data']['shear'][slice_set]
lid_disp = exp['data']['lid_disp'][slice_set]
eqs = _hdf5group_to_dict(exp['eqs'])
except:
except KeyError:
normal = exp['normal'][()]
len_data = len(exp['shear'])
shear = exp['shear'][slice_set]
lid_disp = exp['lid_disp'][slice_set]
normal = exp['normal'][()]
eqs = dict()
eqs['eqd'] = exp['eqd'][()]
eqs['eqe'] = exp['eqe'][()]
eqs['eqf'] = exp['eqf'][()]
eqs['eqi'] = exp['eqi'][()]
eqs['eqm'] = exp['eqm'][()]
if slice_set.start < 0:
# Change slice for propagation of eqs (relative to absolute)
new_start = len_data+slice_set.start
new_end = len_data+slice_set.stop
slice_set = slice(new_start, new_end)
eqs = _hdf5group_to_dict(exp['eqs'])
Fs = exp['Fs'][()]
# Create a slice for the preprocessed event database (eqs)
......@@ -340,4 +342,4 @@ def _slice_hdf5(file_path, slice_set, set_dir_name, fname_set):
setf.create_dataset('lid_disp', data=lid_disp, compression='gzip')
setf.create_dataset('Fs', data=Fs)
setf.create_dataset('normal', data=normal)
_dict_to_hdf5(setf, eqs_new)
_dict_to_hdf5(setf, {'eqs': eqs_new})
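For orientation, a minimal sketch of what a helper like _dict_to_hdf5 could look like if it mirrors _hdf5group_to_dict (defined further below) and writes nested dicts such as {'eqs': eqs_new} as subgroups; this is an assumption for illustration, not the repository's actual implementation:

import numpy as np

def _dict_to_hdf5_sketch(h5group, in_dict):
    '''Write each dict entry as a dataset; nested dicts become subgroups.'''
    for key, value in in_dict.items():
        if isinstance(value, dict):
            _dict_to_hdf5_sketch(h5group.create_group(key), value)
        else:
            h5group.create_dataset(key, data=np.asarray(value))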
#!/usr/bin/env python3
'''
preparation_main.py
Script to run for data preparation for a folder with sets from multiple
experiments.
__AUTHOR__: Michael Rudolf
__DATE__: 25-Mar-2019
'''
# Import the necessary modules
import importlib
import h5py
import numpy as np
import os
import shutil
import logging
import matplotlib.pyplot as plt
import preparation
importlib.reload(preparation)
# List of file paths, depending on where I run the script
folder_paths = {
'office2': '/home/mrudolf/Documents/py_allSets_ML/'
}
folder = folder_paths['office2']
set_file_list = [folder+f for f in os.listdir(folder) if f.endswith('h5')]
cfg_file_list = [folder+f for f in os.listdir(folder) if f.endswith('ini')]
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Workflow\n",
"1. Take window\n",
"2. Detrend\n",
"3. Filter\n",
"4. Feature"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Import the necessary modules\n",
"import importlib\n",
"import h5py\n",
"import numpy as np\n",
"import os\n",
"import shutil\n",
"import matplotlib.pyplot as plt\n",
"import inspect\n",
"import multiprocessing as mp\n",
"from multiprocessing import Pool\n",
"\n",
"import filters\n",
"import feature_functions as ffc\n",
"\n",
"importlib.reload(ffc)\n",
"importlib.reload(filters)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Functions to extract windows and create features"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def extract_windows(data, window, step_frac, eqs):\n",
" '''\n",
" Extracts windows of data and gives it back as a list of arrays. Also\n",
" generates a numpy array with labels that have been found using peak\n",
" detection.\n",
" '''\n",
"\n",
" # We do a list comprehension to create small windows of data\n",
" step = round(window*step_frac)\n",
" num_its = int(np.floor(len(data)/(window*step_frac)))\n",
" win_data = [data[i:i+step] for i in range(0, num_its*window, step)]\n",
"\n",
" # The labels are generated from an array of the same length as the data.\n",
" labels = np.zeros_like(data)\n",
" # Using binary format makes it easy to add up\n",
" # multiple labels just in case there are multiple in one window\n",
" leg_eqs = {'eqi': int('00001', 2),\n",
" 'eqd': int('00010', 2),\n",
" 'eqm': int('00100', 2),\n",
" 'eqf': int('01000', 2),\n",
" 'eqe': int('10000', 2)}\n",
" # In the array we replace the zeros with the binary label of the event\n",
" for eq in eqs:\n",
" for i in eqs[eq]:\n",
" if not np.isnan(i):\n",
" try:\n",
" labels[int(i)] = leg_eqs[eq]\n",
" except IndexError:\n",
" pass\n",
" # Then we add up all labels in a window, to be able to combine events\n",
" label_data = [np.sum(labels[i:i+step]) for i in range(0,\n",
" num_its*window,\n",
" step)]\n",
"\n",
" return (win_data, label_data)\n",
"\n",
"\n",
"def create_features(window, f_list):\n",
" ''' Uses the functions given in f_list to calculate features '''\n",
" features = np.zeros(len(f_list))\n",
" for (i, fnc) in enumerate(f_list):\n",
" features[i] = fnc[1](window)\n",
" return features\n",
"\n",
" \n",
"def _hdf5group_to_dict(h5group):\n",
" '''\n",
" Returns a dictionary with each element in the h5group.\n",
" '''\n",
" out_dict = dict()\n",
" for dset in h5group.keys():\n",
" out_dict[dset] = h5group[dset][()]\n",
" return out_dict\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 1. Initiate location and extract windows"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"# List of file paths, depending on where I run the script\n",
"file_paths = {\n",
" 'home_office': 'C:/Users/Michael/ownCloud/DocStelle/GitRepos/shear-madness/1-feature-generation/ExampleData/b_5kPa_371-01-27-GB300_subsets/',\n",
" 'lab': 'C:/Users/M.Rudolf/ownCloud/DocStelle/GitRepos/shear-madness/1-feature-generation/ExampleData/b_5kPa_371-01-27-GB300_subsets/',\n",
" 'office': '/home/mrudolf/ownCloud/DocStelle/GitRepos/shear-madness/1-feature-generation/ExampleData/b_5kPa_371-01-27-GB300_subsets/',\n",
" 'office2': '/home/mrudolf/Documents/py_allSets_ML/'\n",
"}\n",
"# Location where the files are\n",
"file_path = file_paths['office2']\n",
"\n",
"# Parameters\n",
"window = 30\n",
"step_frac = 1\n",
"min_cycles = 5\n",
"min_win = 10\n",
"\n",
"win_data = []\n",
"label_data = []\n",
"# Iterate over files and create windowed data\n",
"file_list = [f for f in os.listdir(file_path) if f.endswith('.h5')]\n",
"for (i, file) in enumerate(file_list):\n",
" with h5py.File(file_path+file) as hf:\n",
" shear = hf['shear']\n",
" eqs = _hdf5group_to_dict(hf['eqs'])\n",
" (windows, labels) = extract_windows(shear, window, step_frac, eqs)\n",
" win_data.extend(windows)\n",
" label_data.extend(labels)\n",
" print('%6i windows after %2i files.' % (len(win_data),i+1))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 2. Filter the data using 'filters'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"win_filt = [filters.filter_data(window, 60, 625) for window in win_data]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 3. Create Features"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"%%time\n",
"importlib.reload(ffc)\n",
"# Look into feature_functions to get a list of functions to process with\n",
"f_list = inspect.getmembers(ffc, inspect.isfunction)\n",
"fnames = [entry[0].replace('do_','') for entry in f_list]\n",
"features = [create_features(window, f_list) for window in win_data]\n",
"X = np.array(features)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X = np.array(features)\n",
"Y = np.array(label_data)\n",
"X.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"folder = os.path.split(file_path)[0]\n",
"(out_folder, out_file) = os.path.split(folder)\n",
"out_file = out_file.replace('_subsets', '_features.h5')\n",
"asciiList = [n.encode(\"ascii\", \"ignore\") for n in fnames]\n",
"\n",
"with h5py.File(out_folder+'/'+out_file, 'w') as out_hf:\n",
" out_hf.create_dataset('X', data=X, compression='gzip')\n",
" out_hf.create_dataset('Y', data=Y, compression='gzip')\n",
" out_hf.create_dataset('feature_names', data=asciiList)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"out_folder+'/'+out_file\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fnames"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
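The binary label scheme in extract_windows above sums the per-sample flags within a window, so a window containing one 'eqd' and one 'eqf' event gets the label 0b01010 = 10. Assuming at most one event of each type per window, the summed label can be decoded back into event names; the helper below is illustrative and not part of the notebook:

def decode_label(label):
    '''Split a summed binary label back into the event keys used above.'''
    leg_eqs = {'eqi': 0b00001, 'eqd': 0b00010, 'eqm': 0b00100,
               'eqf': 0b01000, 'eqe': 0b10000}
    return [name for name, flag in leg_eqs.items() if int(label) & flag]

print(decode_label(10))  # -> ['eqd', 'eqf']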
......@@ -2,31 +2,52 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 10,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"<module 'feature_generation' from '/home/mrudolf/ownCloud/DocStelle/GitRepos/shear-madness/1-feature-generation/feature_generation.py'>"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Import the necessary modules\n",
"import feature_generation as ftg\n",
"import importlib\n",
"import h5py\n",
"import numpy as np\n",
"import os\n",
"import shutil\n",
"import logging\n",
"import matplotlib.pyplot as plt\n",
"\n",
"logger = logging.getLogger()\n",
"logger.setLevel(logging.INFO)\n",
"\n",
"importlib.reload(ftg)\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 40.9 s, sys: 113 ms, total: 41 s\n",
"Wall time: 40.9 s\n"
]
}
],
"source": [
"%%time\n",
"# List of file paths, depending on where I run the script\n",
"file_paths = {\n",
" 'home_office': 'C:/Users/Michael/ownCloud/DocStelle/GitRepos/shear-madness/1-feature-generation/ExampleData/b_5kPa_371-01-27-GB300_subsets/',\n",
" 'lab': 'C:/Users/M.Rudolf/ownCloud/DocStelle/GitRepos/shear-madness/1-feature-generation/ExampleData/b_5kPa_371-01-27-GB300_subsets/',\n",
" 'office': '~/home/mrudolf/ownCloud/DocStelle/GitRepos/shear-madness/1-feature-generation/ExampleData/b_5kPa_371-01-27-GB300_subsets/'\n",
" 'office': '/home/mrudolf/ownCloud/DocStelle/GitRepos/shear-madness/1-feature-generation/ExampleData/b_5kPa_371-01-27-GB300_subsets/'\n",
"}\n",
"file_path = file_paths['lab']\n",
"# Location where the files are\n",
"file_path = file_paths['office']\n",
"\n",
"file_list = [f for f in os.listdir(file_path) if f.endswith('.h5')]"
"ftg.run(file_path)"
]
},
{
......@@ -34,26 +55,7 @@
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for (i, file) in enumerate(file_list):\n",
" with h5py.File(file_path+file) as hf:\n",
" if i < 1:\n",
" shear = hf['shear'][()][:, None]\n",
" else:\n",
" shear = np.hstack([shear, hf['shear'][()][:, None]])\n",
" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Workflow\n",
"1. Take window\n",
"2. Detrend\n",
"3. Filter\n",
"4. Feature"
]
"source": []
}
],
"metadata": {
......@@ -72,7 +74,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
"version": "3.6.7"
}
},
"nbformat": 4,
......
#!/usr/bin/env python3
'''
feature_functions.py
Module containing all feature functions used for feature generation.
__AUTHOR__: Jon Bedford
__DATE__: 26-Feb-2019
'''
import numpy as np
import scipy
from scipy import stats
import tsfresh.feature_extraction.feature_calculators as ts
def do_variance(y_in):
return np.var(y_in)
def do_std(y_in):
return np.std(y_in)
def do_skew(y_in):
return stats.skew(y_in)
def do_kurtosis(y_in):
return stats.kurtosis(y_in)
def do_range(y_in):
return np.ptp(y_in)
def do_mean(x):
return np.mean(x)
def do_median(x):
return np.median(x)
def do_maximum(x):
return np.max(x)
def do_minimum(x):
return np.min(x)
def do_integral(x):
return np.sum(np.absolute(x))
def do_interquartile_range_75_25(x):
"""Compute interquartile range. Range in which 50% of the data lies."""
q75, q25 = np.percentile(x, [75, 25])
return q75 - q25
def do_max_diff(x):
return np.max(np.diff(x))
def do_abs_energy(x):
"""Compute engery of waveform. Implementation varies from tsfresh."""
return np.sum(np.square(x))
def do_absolute_sum_of_changes(x):
return np.sum(np.absolute(np.diff(x)))
def do_autocorrelation(x):
lag = int(len(x)/2)
y1 = x[:(len(x)-lag)]
y2 = x[lag:]
# Subtract the mean of the whole series x
x_mean = np.mean(x)
# The result is sometimes referred to as "covariation"
sum_product = np.sum((y1-x_mean)*(y2-x_mean))
# Return the normalized unbiased covariance
return sum_product / ((len(x) - lag) * np.var(x))
def do_binned_entropy(x):
max_bins = 5
hist, bin_edges = np.histogram(x, bins=max_bins)
probs = hist / len(x)
probs = probs[np.nonzero(probs)]
return -np.sum(probs * np.log(probs))
def do_cid_ce(x):
normalize = True
if normalize:
s = np.std(x)
if s != 0:
x = (x - np.mean(x))/s
else:
return 0.0
x = np.diff(x)
return np.sqrt(np.sum((x * x)))
def do_count_above_mean(x):
m = np.mean(x)
return np.where(x > m)[0].shape[0]
def do_count_below_mean(x):
m = np.mean(x)
return np.where(x < m)[0].shape[0]
def do_first_location_of_maximum(x):
return [np.argmax(x) / len(x) if len(x) > 0 else np.NaN]
def do_first_location_of_minimum(x):
return [ts.first_location_of_minimum(x)]
def do_last_location_of_maximum(x):
return [1.0 - np.argmax(x[::-1]) / len(x) if len(x) > 0 else np.NaN]
def do_last_location_of_minimum(x):
return [1.0 - np.argmin(x[::-1]) / len(x) if len(x) > 0 else np.NaN]
def do_longest_strike_above_mean(x):
return [ts.longest_strike_above_mean(x)]
def do_longest_strike_below_mean(x):
return [ts.longest_strike_below_mean(x)]
def do_mean_change(x):
return [ts.mean_change(x)]
def do_mean_abs_change(x):
return [np.mean(np.absolute(np.diff(x)))]
def do_mean_second_derivative_central(x):
return [ts.mean_second_derivative_central(x)]
#!/usr/bin/env python3
'''
feature_generation.py
Module that generates all features.
__AUTHOR__: Michael Rudolf
__DATE__: 26-Feb-2019
'''
import configparser
import inspect
import h5py
import numpy as np
import os
import logging
import filters
import feature_functions as ffc
def run(file_path, **kwargs):
'''
Runs the full feature extraction for all h5 files in file_path.
If a config file with the same name as the folder exists in the path, the
parameters given there are used. Additional keyword arguments are passed on
to the configparser to override individual parameters.
'''
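    # Hypothetical usage (illustration only; the exact keyword names run()
    # accepts are whatever the configparser sections define, which is not
    # shown in this excerpt):
    #   import feature_generation as ftg
    #   ftg.run('/home/mrudolf/Documents/py_allSets_ML/', window='30', step_frac='1')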