Commit 7065a25c authored by Marius Kriegerowski

Merge branch 'real_data'

parents 4d4e3549 c6967e5d
......@@ -4,8 +4,10 @@ import tensorflow as tf
from pyrocko.io import save, load
from pyrocko.model import load_stations
from pyrocko.guts import Object, String, Int, Float, Tuple
from pyrocko.gui import marker
from pyrocko.gf.seismosizer import Engine, Target
from pyrocko import orthodrome
from pyrocko import pile
from swarm import synthi
import logging
......@@ -16,6 +18,7 @@ import glob
from .tf_util import _FloatFeature, _Int64Feature, _BytesFeature
pjoin = os.path.join
EPSILON = 1E-4
class Noise(Object):
......@@ -28,32 +31,82 @@ class Noise(Object):
def get_chunk(self, n_channels, n_samples):
...
class WhiteNoise(Noise):
def get_chunk(self, n_channels, n_samples):
return num.random.random((n_channels, n_samples)).astype(num.float32) * self.level
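Note: the uniform chunk above is strictly non-negative offset noise in [0, level). A zero-mean Gaussian variant could plug into the same get_chunk interface; the following subclass is a hypothetical sketch, not part of this commit, and assumes the `level` attribute declared on `Noise` above.

class GaussianNoise(Noise):
    '''Hypothetical: zero-mean Gaussian noise; `level` acts as the
    standard deviation.'''

    def get_chunk(self, n_channels, n_samples):
        return num.random.normal(
            0., self.level, (n_channels, n_samples)).astype(num.float32)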
class DataGeneratorBase(Object):
class DataGenerator(Object):
_shape = None
fn_stations = String.T()
    need_same_dimensions = True  # not sure whether this is actually needed...
effective_deltat = Float.T(optional=True)
n_samples_max = Int.T(default=3500, help='everything longer than this will be dropped')
reference_target = Target.T(
default=Target(
codes=('', 'NKC', '', 'SHZ'),
lat=50.2331,
lon=12.448,
elevation=546))
noise = Noise.T(optional=True, help='Add noise to your feature chunks')
def __init__(self, *args, **kwargs):
super(DataGeneratorBase, self).__init__(*args, **kwargs)
super(DataGenerator, self).__init__(*args, **kwargs)
self.classes = [
#
# TODO
# https://hanxiao.github.io/2017/07/07/Get-10x-Speedup-in-Tensorflow-Multi-Task-Learning-using-Python-Multiprocessing/
# 'north_shift', 'east_shift', 'depth', 'strike', 'dip', 'rake']
'north_shift', 'east_shift', 'depth']
self.n_classes = len(self.classes)
self.setup()
def setup(self):
pass
def extract_labels(self, source):
return [getattr(source, classname) for classname in self.classes]
    def get_raw_data_chunk(self):
        '''
        Return an array of shape (n_channels, n_samples_max), prefilled
        with synthetic noise if `self.noise` is set.
        '''
        stencil = num.zeros(self.tensor_shape, dtype=num.float32)
        if self.noise is not None:
            stencil += self.noise.get_chunk(*self.tensor_shape)
        return stencil
    def attach_graph(self, node):
        '''
        Override this method to attach any preprocessing to be done in the
        tensorflow graph.
        '''
        return node
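Since `Model` (see model.py below) pipes a `tf.data.Dataset` through `attach_graph`, a subclass can hook graph-side preprocessing in here. A minimal sketch, assuming a hypothetical subclass and the TF1-era `tf.data` API:

class StandardizedData(DataGenerator):
    '''Hypothetical: standardize each chunk inside the tensorflow graph.'''

    def attach_graph(self, dataset):
        def _standardize(chunk, labels):
            mean, var = tf.nn.moments(chunk, axes=[0, 1])
            return (chunk - mean) / tf.sqrt(var + EPSILON), labels
        return dataset.map(_standardize)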
    def regularize_deltat(self, tr):
        '''Equalize sampling rates across the data set to
        `self.effective_deltat`.'''
        if abs(tr.deltat - self.effective_deltat) > EPSILON:
            tr.resample(self.effective_deltat)
    def fit_data_into_chunk(self, traces, chunk, indices=None, tref=0):
        '''Clip `traces` to at most `n_samples_max` samples relative to
        `tref` and add them to the rows of `chunk` given by `indices`.'''
        indices = indices or range(len(traces))
        for i, tr in zip(indices, traces):
            data_len = len(tr.data)
            istart_trace = int((tr.tmin - tref) / tr.deltat)
            istart_array = max(istart_trace, 0)
            istart_trace = max(-istart_trace, 0)
            ydata = tr.data[
                istart_trace:
                min(data_len, self.n_samples_max - istart_array) + istart_trace]
            chunk[i, istart_array: istart_array + ydata.shape[0]] += ydata
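A worked example of the index arithmetic above, with illustrative values:

# Suppose tref = 100.0 s, tr.tmin = 99.5 s, tr.deltat = 0.01 s,
# len(tr.data) = 2000 and n_samples_max = 3500:
#
#   istart_trace = int((99.5 - 100.0) / 0.01) = -50
#   istart_array = max(-50, 0) = 0   (write from the start of the chunk row)
#   istart_trace = max(50, 0) = 50   (skip the 50 samples recorded before tref)
#   ydata = tr.data[50: min(2000, 3500 - 0) + 50] -> tr.data[50:2000],
#           i.e. the 1950 samples remaining after the skip
#   chunk[i, 0:1950] += ydata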
@property
def tensor_shape(self):
return self._shape
......@@ -84,7 +137,74 @@ class DataGeneratorBase(Object):
self.writer.write(ex.SerializeToString())
class TFRecordData(DataGeneratorBase):
class PileData(DataGenerator):
data_path = String.T()
data_format = String.T(default='mseed')
fn_markers = String.T()
deltat_want = Float.T(optional=True)
def setup(self):
        # TODO convert to north_shift, east_shift according to seismosizer data grid
self.classes = ['lat', 'lon', 'depth']
self.data_pile = pile.make_pile(self.data_path, fileformat=self.data_format)
markers = marker.load_markers(self.fn_markers)
marker.associate_phases_to_events(markers)
markers.sort(key=lambda x: x.tmin)
markers_by_nsl = {}
for m in markers:
if not m.match_nsl(self.reference_target.codes[:3]):
continue
key = m.one_nslc()[:3]
_ms = markers_by_nsl.get(key, [])
_ms.append(m)
markers_by_nsl[key] = _ms
        assert len(markers_by_nsl) == 1
self.markers = list(markers_by_nsl.values())[0]
self.deltat_want = self.deltat_want or min(self.data_pile.deltats.keys())
self.channels = list(self.data_pile.nslc_ids.keys())
self.tensor_shape = (len(self.channels), self.n_samples_max)
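The grouping loop above could equivalently use collections.defaultdict; a behavior-equivalent sketch of that part of setup:

from collections import defaultdict

markers_by_nsl = defaultdict(list)
for m in markers:
    if m.match_nsl(self.reference_target.codes[:3]):
        markers_by_nsl[m.one_nslc()[:3]].append(m)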
    def check_inputs(self):
        if len(self.data_pile.deltats) > 1:
            logging.warning(
                'Different sampling rates in dataset. Preprocessing is slow.')
    def preprocess(self, tr):
        '''Bring `tr` to the target sampling interval `deltat_want`.'''
        if tr.deltat - self.deltat_want > EPSILON:
            # trace is sampled more coarsely than wanted: interpolate up
            tr.resample(self.deltat_want)
        elif tr.deltat - self.deltat_want < -EPSILON:
            # trace is sampled more finely than wanted: decimate down
            tr.downsample_to(self.deltat_want)
def generate(self):
tr_len = self.n_samples_max * self.deltat_want
nslc_to_index = {nslc: idx for idx, nslc in enumerate(self.channels)}
for m in self.markers:
event = m.get_event()
if event is None:
logging.debug('No event: %s' % m)
continue
for trs in self.data_pile.chopper(
tmin=m.tmin, tmax=m.tmin+tr_len, keep_current_files_open=True):
                _trs = []
                for tr in trs:
                    # work on float copies of the (integer) raw samples
                    tr.data = tr.ydata.astype(num.float32)
                    _trs.append(tr)
                trs = _trs
if not len(trs):
logging.debug('no data at tmin=%s' % m.tmin)
continue
chunk = self.get_raw_data_chunk()
indices = [nslc_to_index[tr.nslc_id] for tr in trs]
self.fit_data_into_chunk(trs, chunk, indices=indices, tref=m.tmin)
yield chunk, self.extract_labels(event)
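A minimal driver for this generator; the paths and file names below are placeholders, not part of this commit:

if __name__ == '__main__':
    gen = PileData(
        fn_stations='stations.pf',
        data_path='/path/to/waveforms',
        data_format='gse2',
        fn_markers='markers.pf')

    for chunk, labels in gen.generate():
        # chunk: (n_channels, n_samples_max) float32 array
        # labels: [lat, lon, depth] of the associated event
        pass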
class TFRecordData(DataGenerator):
# NOT TESTED YET
def __init__(self, input_fn, *args, **kwargs):
......@@ -98,47 +218,20 @@ class TFRecordData(DataGeneratorBase):
return tf.python_io.tf_record_iterator(self.input_fn)
class OnTheFlyData(DataGeneratorBase):
fn_stations = String.T()
class OnTheFlyData(DataGenerator):
gf_engine = Engine.T()
n_sources = Int.T(default=100)
n_samples_max = Int.T(default=3500, help='everything longer than this will be dropped')
onset_phase = String.T(default='p')
reference_target = Target.T(
default=Target(
codes=('', 'NKC', '', 'SHZ'),
lat=50.2331,
lon=12.448,
elevation=546))
quantity = String.T(default='velocity')
    tpad = Float.T(default=1., help='padding between P phase onset and data chunk start')
noise = Noise.T(optional=True)
def setup(self):
self.classes = [
#
# TODO
# https://hanxiao.github.io/2017/07/07/Get-10x-Speedup-in-Tensorflow-Multi-Task-Learning-using-Python-Multiprocessing/
# 'north_shift', 'east_shift', 'depth', 'strike', 'dip', 'rake']
'north_shift', 'east_shift', 'depth']
self.n_classes = len(self.classes)
stations = load_stations(self.fn_stations)
self.targets = synthi.guess_targets_from_stations(
stations, quantity=self.quantity)
self.tensor_shape = (len(self.targets), self.n_samples_max)
def get_raw_data_chunk(self):
'''
Return an array of size (Nchannels x Nsamples_max).
When working with noisy data, replace this function.
'''
stencil = num.zeros(self.tensor_shape, dtype=num.float32)
if self.noise is not None:
stencil += self.noise.get_chunk(*self.tensor_shape)
return stencil
def make_data_chunk(self, source, results, store):
ydata_stacked = self.get_raw_data_chunk()
tref = store.t(
......@@ -148,28 +241,14 @@ class OnTheFlyData(DataGeneratorBase):
)
tref += (source.time - self.tpad)
for iresult, result in enumerate(results):
tr = result.trace
data_len = len(tr.data)
istart_trace = int((tr.tmin - tref) / tr.deltat)
istart_array = max(istart_trace, 0)
istart_trace = max(-istart_trace, 0)
istop_array = istart_array + (data_len - 2* istart_trace)
ydata = tr.data[ \
istart_trace: min(data_len, self.n_samples_max-istart_array)+istart_trace]
ydata_stacked[iresult, istart_array: istart_array+ydata.shape[0]] += ydata
traces = [result.trace for result in results]
self.fit_data_into_chunk(traces, ydata_stacked, tref=tref)
        # min-max normalize the stacked traces to the range [0, 1]
        ydata_stacked -= num.min(ydata_stacked)
        ydata_stacked /= num.max(ydata_stacked)
return ydata_stacked
def extract_labels(self, source):
return [getattr(source, classname) for classname in self.classes]
def generate(self):
swarm = synthi.setup(self.gf_engine, self.n_sources)
......
......@@ -15,7 +15,7 @@ logger = logging.getLogger('pinky.model')
class Model(Object):
data_generator = OnTheFlyData.T()
data_generator = DataGenerator.T()
dropout_rate = Float.T(default=0.1)
batch_size = Int.T(default=10)
outdir = String.T(default='/tmp/dnn-seis')
......@@ -45,6 +45,8 @@ class Model(Object):
self.data_generator.generate_output_types,
output_shapes=shape)
dataset = self.data_generator.attach_graph(dataset)
dataset = dataset.batch(self.batch_size)
dataset = dataset.repeat()
dataset = dataset.prefetch(buffer_size=self.batch_size)
......
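For context, the input pipeline above presumably wraps the generator with `tf.data.Dataset.from_generator`; a minimal sketch of that pattern (TF1-era API; the helper name and dtypes are assumptions):

def make_input_fn(data_generator, batch_size):
    # Sketch only: output types/shapes must match what generate() yields.
    dataset = tf.data.Dataset.from_generator(
        data_generator.generate,
        output_types=(tf.float32, tf.float32),
        output_shapes=(tf.TensorShape(data_generator.tensor_shape),
                       tf.TensorShape([data_generator.n_classes])))
    dataset = data_generator.attach_graph(dataset)
    dataset = dataset.batch(batch_size).repeat()
    return dataset.prefetch(buffer_size=batch_size)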
--- !pinky.model.Model
data_generator: !pinky.data.PileData
fn_stations: stations.pf
data_path: /media/usb0/vogtland/gse2
data_format: gse2
n_samples_max: 1500
reference_target: !pyrocko.gf.targets.Target
lat: 50.2331
lon: 12.448
depth: 0.0
codes: ['', NKC, '', SHZ]
elevation: 546.0
interpolation: nearest_neighbor
fn_markers: markers_phases_events_nkc_P.pf
dropout_rate: 0.1
batch_size: 10
auto_clear: true
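A configuration like the above can be loaded back through pyrocko's guts machinery; a minimal sketch, assuming the file is saved as config.yaml and the pinky modules are importable so that the !pinky.* YAML tags are registered:

from pyrocko.guts import load
import pinky.model  # noqa: registers the !pinky.* YAML tags on import

model = load(filename='config.yaml')
print(model.data_generator.data_path)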