import logging
import datetime
import os
import struct
from obspy import UTCDateTime
import numpy as np
from math import floor, ceil
from numbers import Number


def tup2time(fraction, seconds):
    """Convert a TDMS timestamp (seconds since 1904-01-01 plus a positive
    fraction of a second in units of 2**-64) to a datetime"""
    dt1904 = datetime.datetime(1904, 1, 1)
    delta = seconds + fraction * 2**(-64)
    return dt1904 + datetime.timedelta(seconds=delta)
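
# Example (illustrative values): a fraction of 2**63 is half a second, so
#   tup2time(2**63, 0) -> datetime.datetime(1904, 1, 1, 0, 0, 0, 500000)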


class PotentialGap(Exception):
    """Raised when the samples to read seem to fall in a gap"""


class NoData(Exception):
    """Raised when no data can be found for the requested selection"""


class TDMS(object):
    """Class to read and export seismic waveforms in TDMS format"""
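
    # A minimal usage sketch (hedged: prefix, directory and channel range are
    # hypothetical; files are expected to be named like
    # '<filename>_UTC_YYYYMMDD_HHMMSS.sss.tdms'):
    #
    #   with TDMS('experiment1', directory='/data/tdms', chstart=0, chstop=9) as t:
    #       for data, stats in t:
    #           print(stats['station'], stats['starttime'], stats['npts'])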

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close open resources when leaving the 'with' context"""
        if self.__fi is not None:
            self.__fi.close()

    def __enter__(self):
        """Allow the use of the syntax 'with object:' and give access to the instance inside"""

        # Create a buffer space to store the signal coefficients to be
        # convolved during the decimation
        logging.debug('Create empty buffers')
        for channel in self.__channels:
            self.__buffer[channel] = None

        return self

    def __init__(self, filename, directory='.', chstart=0, chstop=None, chstep=1, channels=None,
                 starttime=None, endtime=None, iterate='D', decimate=1, firfilter='fir235',
                 loglevel='INFO'):
        """Initialize the TDMS object selecting the data, channels and decimation

        :param filename: Experiment to read and process. Usually the first part of the filenames
        :type filename: str
        :param directory: Directory where files are located
        :type directory: str
        :param chstart: First channel to select
        :type chstart: int
        :param chstop: Last channel to select
        :type chstop: int
        :param chstep: Step between channels in the selection
        :type chstep: int
        :param channels: Selection of channels to work with (list of integers)
        :type channels: list
        :param starttime: Start of the selected time window
        :type starttime: datetime.datetime
        :param endtime: End of the selected time window
        :type endtime: datetime.datetime
        :param iterate: Select either Data (D) or Metadata (M)
        :type iterate: str
        :param decimate: Factor by which the sampling rate is lowered by decimation
        :type decimate: int
        :param firfilter: Filter to apply in case of decimation (fir235 is the only option)
        :type firfilter: str
        :param loglevel: Verbosity in the output
        :type loglevel: str
        """

        # Log level
        self.__loglevel = loglevel
        logs = logging.getLogger('Init TDMS')
        logs.setLevel(loglevel)

        project_dir = os.path.dirname(__file__)

        # Decimation factor
        self.__decimate = decimate

        # Selection of channels
        if not isinstance(chstart, Number) or not isinstance(chstep, Number):
            logs.error('chstart and chstep must be numbers and preferably ints')
            raise TypeError('chstart and chstep must be numbers and preferably ints')

        if not isinstance(chstop, Number) and chstop is not None:
            logs.error('chstop must be a number or None')
            raise TypeError('chstop must be a number or None')

        self.__chstart = chstart
        self.__chstop = chstop
        self.__chstep = chstep

        # channels has priority. Otherwise, chstart, chstop and chstep are used
        if channels is not None and isinstance(channels, list):
            self.__channels = channels
            # List of channels cannot be empty
            if not len(self.__channels):
                raise Exception('Channel list is empty!')
        else:
            # If chstart, chstop and chstep define the selection, keep the three values
            # and build the channel list later in __readmetadata
            self.__channels = None

        # Time window selection
        self.__twstart = starttime
        self.__twend = endtime

        # Available time window
        self.starttime = None
        self.endtime = None

        # Sampling Rate
        self.sampling_rate = None

        # File currently being processed
        self.__currentfile = None

        # Name of file
        self.__filename = filename
        self.__directory = directory

        # List of available files in the directory
        self.__available = list()

        for file in sorted(os.listdir(directory)):
            if not file.startswith(filename):
                continue

            dt = datetime.datetime.strptime(file[len(filename):-len('.tdms')], '_%Z_%Y%m%d_%H%M%S.%f')
            self.__available.append({'dt': dt, 'name': file, 'samples': None})
            if self.__twstart is None:
                self.__twstart = dt

        # Set the time window selection to the minimum datetime found
        if not len(self.__available):
            raise NoData('No files found for %s in %s' % (filename, directory))

        if self.__twstart < self.__available[0]['dt']:
            self.__twstart = self.__available[0]['dt']

        # Keep the values in case we need to reset them
        self.__origstarttime = self.__twstart
        self.__origendtime = self.__twend

        # What should we iterate? D: Data; M: Metadata
        self.iterate = iterate

        # Define a buffer to store the window over the signal to be
        # convolved with the FIR filter while decimating
        # Keys will be channel number and the values will be np.arrays
        self.__buffer = dict()

        # Datatype of each channel
        self.__datatypes = dict()

        # Dictionary to save the metadata defined in the file
        self.metadata = dict()

        # Initialization of local variables
        self.__HEADERLEN = 28
        self.__MAXSAMPLES = 30000
        self.__FF64b = 0xFFFFFFFFFFFFFFFF
        self.__FF32b = 0xFFFFFFFF

        self.__data2mask = {
            0: ('c', 1),   # tdsTypeVoid
            1: ('b', 1),   # tdsTypeI8
            2: ('h', 2),   # tdsTypeI16
            3: ('i', 4),   # tdsTypeI32
            4: ('q', 8),   # tdsTypeI64
            5: ('B', 1),   # tdsTypeU8 (unsigned)
            6: ('H', 2),   # tdsTypeU16 (unsigned)
            7: ('I', 4),   # tdsTypeU32 (unsigned)
            8: ('Q', 8),   # tdsTypeU64 (unsigned)
            9: ('f', 4),   # tdsTypeSingleFloat
            10: ('d', 8),  # tdsTypeDoubleFloat
            0x20: ('I', 4),   # tdsTypeString (length prefix)
            0x21: ('?', 1),   # tdsTypeBoolean
            0x44: ('Qq', 16)  # tdsTypeTimeStamp (u64 fraction, i64 seconds)
        }
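
        # Example (illustrative): a 32-bit signed integer property (type 3,
        # tdsTypeI32) stored little endian would be read with
        # struct.unpack('<i', self.__fi.read(4))[0]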

        # Read filter to decimate
        auxfilter = list()
        with open(os.path.join(project_dir, 'data/filters/%s.txt' % firfilter)) as fin:
            for line in fin.readlines():
                auxfilter.append(float(line))

        self.__filter = np.array(auxfilter)
        logs.debug('FIR filter: %s' % self.__filter)
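
        # The filter file is expected to hold one coefficient per line as plain
        # text, e.g. (illustrative values):
        #   -3.2e-05
        #   1.1e-04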
        #     tdsTypeFixedPoint = 0x4F,
        #     tdsTypeComplexSingleFloat = 0x08000c,
        #     tdsTypeComplexDoubleFloat = 0x10000d,
        #     tdsTypeDAQmxRawData = 0xFFFFFFFF

        # Other variables to be read from the headers
        self.__hasInterleavedData = None
        self.__endian = None
        self.__datatype = None
        self.__datatypesize = None
        self.numchannels = None
        self.__samples = None
        self.__samplestart = None
        self.__sampleend = None
        self.__samplecur = None

        try:
            self.__search_data()
        except IndexError:
            raise NoData()

    def __select_file(self):
        logs = logging.getLogger('Select file')
        logs.setLevel(self.__loglevel)

        if self.__currentfile is None:
            for idx, fi in enumerate(self.__available):
                if self.__twstart < fi['dt']:
                    if not idx:
                        raise Exception('Data not available in the specified time window')
                    filename = os.path.join(self.__directory, self.__available[idx-1]['name'])
                    self.__currentfile = idx-1
                    logs.debug(
                        'Opening %s; Starttime: %s' % (self.__available[self.__currentfile]['name'], self.__twstart))
                    break
            else:
                raise Exception('Data not available in the specified time window')
        elif self.__currentfile >= len(self.__available):
            logs.debug('Last file already processed')
            # No more data to iterate
            raise IndexError
        else:
            filename = os.path.join(self.__directory, self.__available[self.__currentfile]['name'])
            self.__twstart = self.__available[self.__currentfile]['dt']
            if (self.__twend is not None) and (self.__twstart > self.__twend):
                logs.debug('Start is greater than end. %s %s' % (self.__twstart, self.__twend))
                raise IndexError
            logs.debug('Opening %s; Starttime: %s' % (self.__available[self.__currentfile]['name'], self.__twstart))

        # Reset some properties before opening the new file
        self.starttime = self.__available[self.__currentfile]['dt']
        self.endtime = None
        self.metadata = dict()

        # Save the handle of the open file to be processed
        self.__fi = open(filename, 'rb')

        # Read headers
        leadin = self.__fi.read(self.__HEADERLEN)
        (tag, ToCmask) = struct.unpack('<4si', leadin[:8])

        ktocmetadata = 1 << 1
        ktocnewobjlist = 1 << 2
        ktocrawdata = 1 << 3
        ktocinterleaveddata = 1 << 5
        ktocbigendian = 1 << 6
        ktocdaqmxrawdata = 1 << 7

        hasmetadata = bool(ToCmask & ktocmetadata)
        hasnewobjects = bool(ToCmask & ktocnewobjlist)
        hasrawdata = bool(ToCmask & ktocrawdata)
        self.__hasInterleavedData = bool(ToCmask & ktocinterleaveddata)
        hasdaqmxrawdata = bool(ToCmask & ktocdaqmxrawdata)
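
        # Example (illustrative): a ToC mask of 0x0E has bits 1, 2 and 3 set,
        # i.e. the segment contains metadata, a new object list and raw data.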

        # All input from now on will be formatted by this
        self.__endian = '>' if ToCmask & ktocbigendian else '<'

        if tag.decode() != 'TDSm':
            raise Exception('Tag is not TDSm!')

        (versionTDMS, self.__segmentOffset, self.__dataOffset) = \
            struct.unpack('%ciQQ' % self.__endian, leadin[8:])

        logs.debug((tag, ToCmask, versionTDMS, self.__segmentOffset, self.__dataOffset))

        if versionTDMS != 4713:
            logs.warning('Version number is not 4713!')

        if self.__segmentOffset == self.__FF64b:
            logs.error('Severe problem while writing data (crash, power outage)')

        if hasmetadata and not self.__dataOffset:
            logs.error('Flag indicates Metadata but its length is 0!')

        if hasdaqmxrawdata:
            logs.warning('DAQmx raw data is still not supported!')

        # Absolute offsets
        self.__segmentOffset += self.__HEADERLEN
        self.__dataOffset += self.__HEADERLEN

        logs.debug('Metadata: ' + ('yes' if hasmetadata else 'no'))
        logs.debug('Object list: ' + ('yes' if hasnewobjects else 'no'))
        logs.debug('Raw data: ' + ('yes' if hasrawdata else 'no'))
        logs.debug('Interleaved data: ' + ('yes' if self.__hasInterleavedData else 'no'))
        logs.debug('BigEndian: ' + ('yes' if self.__endian == '>' else 'no'))
        logs.debug('DAQmx raw data: ' + ('yes' if hasdaqmxrawdata else 'no'))


    def __search_data(self):
        """Select a file to process and read its metadata

        :raise IndexError: if no more files are available
        """

        while True:
            # Loop through files until there is nothing else (IndexError)
            self.__select_file()

            # Read the metadata and calculate samples to read
            # Skip to the next file if there is a gap
            try:
                self.__readmetadata()
                return
            except PotentialGap:
                logging.debug('Potential gap detected!')
                self.__currentfile += 1

    def __resetcurrenttime(self):
        self.__twstart = self.__origstarttime
        self.__twend = self.__origendtime
        self.__currentfile = None

        self.__search_data()

    def __readmetadata(self):
        """Read the metadata of the current file and calculate the samples to read"""
        logs = logging.getLogger('Read Metadata')

        self.__fi.seek(self.__HEADERLEN, 0)

        # Number of objects (unsigned int - 32b)
        numobjects = struct.unpack('%cI' % self.__endian, self.__fi.read(4))[0]
        logs.debug('Number of objects in metadata: %s' % numobjects)

        curstarttime = None
        datatype = None
        numchannels = 0
        for obj in range(numobjects):
            objpath = self.__readstring()
            self.metadata[obj] = {'path': objpath}

            rawdataidx = struct.unpack('%cI' % self.__endian, self.__fi.read(4))[0]

            if rawdataidx == self.__FF32b:
                logs.debug('No raw data assigned to segment %s' % obj)
                self.metadata[obj]['data'] = False
                self.__readproperties(self.metadata[obj])

                try:
                    if self.sampling_rate is None:
                        self.sampling_rate = self.metadata[obj]['SamplingFrequency[Hz]']
                except KeyError:
                    pass

                try:
                    curstarttime = self.metadata[obj]['GPSTimeStamp']
                    if self.starttime is None:
                        self.starttime = self.metadata[obj]['GPSTimeStamp']
                except KeyError:
                    pass

                continue

            elif not rawdataidx:
                logs.debug('Raw data index in this segment matches the index the same object had in the previous '
                           'segment')

            else:
                self.metadata[obj]['data'] = True
                self.metadata[obj]['id'] = numchannels
                numchannels += 1

                # There is raw data!
                datatype, arraylen, numvalues = struct.unpack('%cIIQ' % self.__endian, self.__fi.read(16))
                if datatype == 0x20:
                    self.metadata[obj]['sizeBytes'] = struct.unpack('%cQ' % self.__endian, self.__fi.read(8))[0]

                if arraylen != 1:
                    logs.error('Array length MUST be 1! Actual value: %s' % arraylen)

                self.metadata[obj]['datatype'] = self.__data2mask[datatype][0]
                self.metadata[obj]['sampling_rate'] = self.sampling_rate
                self.metadata[obj]['starttime'] = curstarttime

                self.__readproperties(self.metadata[obj])

        # Set the data type as numpy expects it
        if datatype is None:
            raise Exception('datatype definition not found in any channel!')

        if self.__data2mask[datatype][0] == 'h':
            self.__datatype = '%ci2' % self.__endian
        elif self.__data2mask[datatype][0] == 'f':
            self.__datatype = '%cf4' % self.__endian
        else:
            raise Exception('Data type not supported! (%s)' % self.__data2mask[datatype][0])

        self.__datatypesize = self.__data2mask[datatype][1]
        self.numchannels = numchannels

        # If channels was not defined at creation time
        if self.__channels is None:
            if self.__chstart >= numchannels:
                raise Exception('No valid channel IDs selected!')
            if self.__chstop is None or self.__chstop >= numchannels:
                logs.info('Resetting chstop from %s to %s' % (self.__chstop, numchannels-1))
                self.__chstop = numchannels - 1
            # Define list of selected channels
            self.__channels = list(range(self.__chstart, self.__chstop + 1, self.__chstep))

        self.__samples = int((self.__segmentOffset - self.__dataOffset) / numchannels / self.__datatypesize)

        # Calculate endtime based on the number of samples declared and the sampling rate
        self.endtime = self.starttime + datetime.timedelta(seconds=(self.__samples-1)/self.sampling_rate)

        for obj in self.metadata:
            if not self.metadata[obj]['data']:
                continue

            self.metadata[obj]['samples'] = self.__samples
            self.metadata[obj]['endtime'] = curstarttime + \
                datetime.timedelta(seconds=(self.__samples - 1) / self.sampling_rate)
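
        # Worked example (illustrative numbers): if the segment holds 60000 bytes
        # for 10 channels of 2-byte samples, __samples = 3000; at 1000 Hz the last
        # sample lies (3000-1)/1000 = 2.999 s after starttime.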

        # Sample to start extraction from, based on the initial datetime of the file (__twstart)
        self.__samplestart = max(floor((self.__twstart - self.starttime).total_seconds() * self.sampling_rate), 0)
        if self.__samplestart >= self.__samples:
            raise PotentialGap('Start reading at %s, but only %s samples' % (self.__samplestart, self.__samples))

        # Readjust __twstart to align it exactly with the time of the first sample to read
        self.__twstart = self.starttime + datetime.timedelta(seconds=self.__samplestart/self.sampling_rate)

        self.__samplecur = self.__samplestart

        # Open end or beyond this file: read until the last sample
        if (self.__twend is None) or (self.__twend >= self.endtime):
            self.__sampleend = self.__samples-1
        else:
            # Otherwise calculate which one is the last sample to read
            self.__sampleend = ceil((self.__twend - self.starttime).total_seconds() * self.sampling_rate)

        logs.debug('Samples: %s' % self.__samples)
        logs.debug('Samples selected: %s-%s' % (self.__samplestart, self.__sampleend))
        logs.debug('Total chunks size: %s' % (self.__segmentOffset - self.__dataOffset))
        channellength = ((self.__segmentOffset - self.__dataOffset)/numchannels/self.__data2mask[datatype][1])
        logs.debug('Length of channel: %d' % channellength)

    def __iter__(self):
        """Iterate through data (or metadata) and filter and decimate if requested"""

        # Create logger
        logs = logging.getLogger('__iter__')

        if self.iterate == 'M':
            for info in self.__iter_metadata__():
                yield info
        else:
            # In the case that we iterate to get data
            # If no decimation is needed
            if self.__decimate == 1:
                for info in self.__iter_data__():
                    yield info
            else:
                # Use an input buffer to store the data coming in chunks from __iter_data__
                inbuf = dict()
                # Keep values in a buffer before returning them to be packed in miniSEED records.
                # Otherwise, the chunks could be too short and the final size of the mSEED
                # would be too large
                outbuf = dict()
                # Keep filtered values which should be later decimated
                nodecimation = dict()
                # Time of the first sample expected in the next chunk (per channel)
                expectedtime = dict()
                # Flag to know if we are at the beginning of the chunk
                startofchunk = True

                for data, stats in self.__iter_data__():
                    ch = int(stats['station'])

                    # Check expected time for each channel
                    if ch in expectedtime:
                        if expectedtime[ch] != stats['starttime']:
                            logs.warning('GAP! Expected: %s ; Current: %s' % (expectedtime[ch], stats['starttime']))
                            # Gap!
                            if ch in inbuf:
                                logs.debug('Remove last %s components of previous chunk' % len(inbuf[ch]['data']))
                                del inbuf[ch]

                            # Flush outbuf
                            if ch in outbuf:
                                leftover = len(nodecimation[ch]) % 5
                                outbuf[ch]['stats']['npts'] = 1
                                yield nodecimation[ch][-leftover::5], outbuf[ch]['stats']
                                logs.debug('Flushing: %s %s' % (nodecimation[ch][-leftover::5], outbuf[ch]['stats']))
                                del outbuf[ch]

                            del expectedtime[ch]

                            # Start the processing of a new chunk after the gap
                            startofchunk = True

                    expectedtime[ch] = stats['starttime'] + stats['npts']/self.sampling_rate

                    # Store values coming from __iter_data__
                    if ch not in inbuf:
                        # Initialize an idx and array with the first chunk of data
                        inbuf[ch] = {'data': np.array(data),
                                     'stats': stats}
                    else:
                        inbuf[ch]['data'] = np.append(inbuf[ch]['data'], data)

                    # Store values resulting from the convolution
                    if ch not in outbuf:
                        # Initialize the headers with the first chunk of data
                        outbuf[ch] = {'stats': inbuf[ch]['stats'].copy()}

                        # Modify the starting time only if it is the beginning of the signal
                        if startofchunk:
                            outbuf[ch]['stats']['starttime'] += (len(self.__filter)-1)/(2 * outbuf[ch]['stats']['sampling_rate'])
                            startofchunk = False

                        # Change the headers to reflect the decimation
                        outbuf[ch]['stats']['sampling_rate'] = stats['sampling_rate']/5
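                        # Shift example (illustrative): with the 235-tap fir235
                        # filter at 1000 Hz, the onset correction above is
                        # (235-1)/(2*1000) = 0.117 s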

                    # If there are not enough components move to the next chunk
                    if len(inbuf[ch]['data']) < len(self.__filter):
                        continue

                    # Convolve inbuf with the filter keeping only the 'valid' part,
                    # i.e. len(inbuf) - len(filter) + 1 components
                    nodecimation[ch] = np.convolve(inbuf[ch]['data'], self.__filter, 'valid').astype(data.dtype)
                    logs.debug('filtered: %d components' % len(nodecimation[ch]))
                    logs.debug('filtered[%d][:11] %s' % (ch, nodecimation[ch][:11]))
                    logs.debug('filtered[%d][-11:] %s' % (ch, nodecimation[ch][-11:]))

                    # Check if we can copy as many components as a multiple of the decimation factor
                    leftover = len(nodecimation[ch]) % 5
                    logs.debug('filtered: leave %d components for next iteration %s' % (leftover, nodecimation[ch][-leftover:]))
                    if leftover:
                        if 'data' not in outbuf[ch]:
                            outbuf[ch]['data'] = nodecimation[ch][:-leftover][::5]
                        else:
                            outbuf[ch]['data'] = np.append(outbuf[ch]['data'], nodecimation[ch][:-leftover][::5])
                    else:
                        if 'data' not in outbuf[ch]:
                            outbuf[ch]['data'] = nodecimation[ch][::5]
                        else:
                            outbuf[ch]['data'] = np.append(outbuf[ch]['data'], nodecimation[ch][::5])

                    logs.debug('outbuf[%d][:11] %s' % (ch, outbuf[ch]['data'][:11]))
                    logs.debug('outbuf[%d][-11:] %s' % (ch, outbuf[ch]['data'][-11:]))

                    # Keep values which will be needed to start again in the next file
                    # to avoid boundary effects. leftover comes from the extra values we already calculated
                    # but we could not use because of decimation. Therefore, we will put them in the beginning
                    # and calculate again
                    valuesprocessed = len(inbuf[ch]['data']) - len(self.__filter) + 1 - leftover
                    logs.debug('values processed: %d' % valuesprocessed)
                    inbuf[ch]['data'] = inbuf[ch]['data'][-len(self.__filter)+1-leftover:]
                    inbuf[ch]['stats']['starttime'] += valuesprocessed / inbuf[ch]['stats']['sampling_rate']
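
                    # Bookkeeping check (illustrative): with a 235-tap filter and
                    # 1000 buffered values, 'valid' yields 1000-235+1 = 766 filtered
                    # components; leftover = 766 % 5 = 1, so 765 values are decimated
                    # to 153 and 234+1 = 235 input values are kept for the next pass.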

                    # If there is enough data, pack and yield it
                    if len(outbuf[ch]['data']) > 2000:
                        outbuf[ch]['stats']['npts'] = len(outbuf[ch]['data'])
                        yield outbuf[ch]['data'], outbuf[ch]['stats']
                        # Reset outbuf with an empty array and the next starttime (in headers)
                        outbuf[ch]['stats']['starttime'] += len(outbuf[ch]['data']) / outbuf[ch]['stats']['sampling_rate']
                        del outbuf[ch]['data']

                # Flush all remaining components from outbuf
                for ch in outbuf:
                    if ('data' in outbuf[ch]) and len(outbuf[ch]['data']):
                        outbuf[ch]['stats']['npts'] = len(outbuf[ch]['data'])
                        yield outbuf[ch]['data'], outbuf[ch]['stats']

    def __iter_data__(self):
        """Read data from files based on channel selection"""
        logs = logging.getLogger('Iterate Data')

        while (self.__twend is None) or (self.__twstart < self.__twend):
            data = self.__readdata(channels=self.__channels)
            # Loop through channels
            for ch in self.__channels:
                stats = {'network': 'XX', 'station': '%05d' % ch, 'location': '',
                         'channel': 'FH1', 'npts': len(data[ch]),
                         'sampling_rate': self.sampling_rate,
                         'starttime': UTCDateTime(self.__twstart),
                         'mseed': {'byteorder': self.__endian,
                                   'reclen': 512}}
                logs.debug('Data length: %d; First component: %s' % (len(data[ch]), data[ch][0]))
                yield data[ch], stats
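
                # A consumer could wrap this directly in an obspy Trace
                # (sketch): Trace(data=data[ch], header=stats)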

            # No more data in this file. Skip to the next one.
            self.__currentfile += 1
            try:
                logs.debug('Moving to next file...')
                self.__search_data()
            except IndexError:
                break

    def __iter_metadata__(self):
        """Iterate through the metadata of the selected channels"""
        while (self.__twend is None) or (self.__twstart < self.__twend):
            for obj in self.metadata:
                if 'id' in self.metadata[obj] and self.metadata[obj]['id'] not in self.__channels:
                    continue
                yield self.metadata[obj]

            # No more data in this file. Skip to the next one.
            self.__currentfile += 1
            try:
                self.__search_data()
            except IndexError:
                break

    def __readstring(self):
        """All strings in TDMS files, such as object paths, property names, property values, and raw data values,
        are encoded in UTF-8 Unicode. All of them, except for raw data values, are preceded by a 32-bit unsigned
        integer that contains the length of the string in bytes, not including the length value itself. Strings in
        TDMS files can be null-terminated, but since the length information is stored, the null terminator will be
        ignored when you read from the file.
        """
        strlen = struct.unpack('%cI' % self.__endian, self.__fi.read(4))
        return self.__fi.read(strlen[0]).decode()
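
    # Illustrative string layout (little endian): b'\x05\x00\x00\x00hello'
    # decodes to length 5 followed by the UTF-8 payload 'hello'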

    def __readvalue(self):
        """Read a property value preceded by its data type"""
        datatype = self.__readdatatype()

        # Consider cases which need another read
        # 0x20 is a string. Read again!
        if datatype == 0x20:
            return self.__readstring()

        (mask, numBytes) = self.__data2mask[datatype]
        # This instruction returns a tuple. Needed for timestamps
        result = struct.unpack('%c%s' % (self.__endian, mask), self.__fi.read(numBytes))

        # 0x44 is a timestamp. Read again!
        if datatype == 0x44:
            result = tup2time(*result)
        else:
            # Disassemble the tuple if not a timestamp
            result = result[0]

        return result

    def __readproperties(self, result=None):
        """For each property, the following information is stored:
            Name (string)
            Data type (tdsDataType)
            Value (numerics stored binary, strings stored as explained above)
        """
        logs = logging.getLogger('readproperties')

        if result is None:
            result = dict()

        numprops = struct.unpack('%cI' % self.__endian, self.__fi.read(4))[0]
        if numprops:
            logs.debug('%s properties' % numprops)

        for prop in range(numprops):
            propstr = self.__readstring()
            value = self.__readvalue()
            result[propstr] = value
            if self.iterate == 'M':
                logs.debug('%s: %s' % (propstr, value))

        return result

    def __readdatatype(self):
        return struct.unpack('%cI' % self.__endian, self.__fi.read(4))[0]

    def __readdata(self, channels=None):
        """Read a chunk of data from the specified channels.
        Update the attribute __samplecur

        :param channels: List of channel numbers to read data from
        :type channels: list
        :return: Dictionary with channel as key and a numpy array with data as value
        :rtype: dict
        """

        logs = logging.getLogger('Read data')
        logs.setLevel(self.__loglevel)

        # If there is no channel specified read from all selected channels
        if channels is None:
            channels = self.__channels
        else:
            for ch in channels:
                # All channels must be within the originally selected channels
                if ch not in self.__channels:
                    logs.error('Trying to read data from an unselected channel!')
                    raise Exception('Trying to read data from an unselected channel!')
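
        # Layout reminder (illustrative, 3 channels c0..c2):
        #   non-interleaved: c0 c0 c0 ... | c1 c1 c1 ... | c2 c2 c2 ...
        #   interleaved:     c0 c1 c2 | c0 c1 c2 | ...
        # Non-interleaved channels can be read in one block each; interleaved
        # data must be traversed sample by sample.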

        result = dict()

        numsamples = self.__sampleend - self.__samplestart + 1

        if not self.__hasInterleavedData:
            for ch in channels:
                # Seek where the channel starts and add the offset (in bytes) to the
                # first sample to read based on the time window selection
                self.__fi.seek(self.__dataOffset + self.__datatypesize*(self.__samples*ch + self.__samplecur), 0)

                # Read all selected data from the channel in one step
                result[ch] = np.fromfile(self.__fi, dtype=self.__datatype, count=numsamples)

            # Update the current sample number based on the number of components read
            self.__samplecur += numsamples

            return result

        # Seek where the raw data starts and add the offset to the first
        # sample to read based on the time window selection
        self.__fi.seek(self.__dataOffset + self.__samplecur*self.__datatypesize*self.numchannels, 0)

        # Reserve the data for the result
        for ch in channels:
            result[ch] = np.zeros((numsamples,), dtype=self.__datatype)

        for sample in range(numsamples):
            # Read from all channels and select the specific one with an index (channel)
            allchannels = np.fromfile(self.__fi, dtype=self.__datatype, count=self.numchannels)
            for ch in channels:
                result[ch][sample] = allchannels[ch]

        # Update the current sample number based on the number of components read
        self.__samplecur += numsamples

        return result