Source code for stosim.analysis.compressor

# -*- coding: utf-8 -*-


"""Compresses data (e.g. averaging)."""

import os
import math

def _collect_y_values(paths, x_idx, y_idx, delim):
    '''
    Read all given files and group the numeric y values by their x cell.

    :param list paths: paths of the data files to read
    :param int x_idx: zero-based index of the x column
    :param int y_idx: zero-based index of the y column
    :param string delim: delimiter used between columns
    :returns: dict mapping each x (as stripped string) to a list of floats
    '''
    values = {}
    min_cells = max(x_idx, y_idx) + 1
    for path in paths:
        # 'with' guarantees the handle is closed even if reading fails
        with open(path, 'r') as f:
            for raw in f:
                line = raw.strip()
                # blank lines are skipped; they must not end the file early
                if not line:
                    continue
                cells = line.split(delim)
                # disregard comments and lines with too few columns
                if cells[0].startswith('#') or len(cells) < min_cells:
                    continue
                x = cells[x_idx].strip()
                if not x:
                    continue
                try:
                    y = float(cells[y_idx])
                except ValueError:
                    # best effort: report the bad cell and keep going
                    print('ERROR: non-numeric y value %r in %s'
                          % (cells[y_idx], path))
                    continue
                # the x key is only created once a valid y value exists, so
                # no x ever ends up with an empty list (division by zero)
                values.setdefault(x, []).append(y)
    return values


def _mean_and_std(ys):
    '''
    Return (mean, sample standard deviation) of a non-empty list of floats.

    The variance is divided by n-1 (or by 1 if there is only one value).
    '''
    n = len(ys)
    mean = sum(ys) / float(n)
    squares = sum(math.pow(y - mean, 2) for y in ys)
    std = math.sqrt(squares / max(n - 1, 1))
    return mean, std


def avg_stats(xCol, yCol, numFiles, filePrefix='', fileSuffix='', filePath='.', delim=',', outName=None):
    '''
    This function can take several data files and transfer them into a file
    that is formatted ready to be plotted by gnuplot (each line is
    "<x> <mean of y vals> <std of y vals>").
    Data files should be named using an index starting by 1 and all have the
    same prefix and/or suffix around that index.
    In gnuplot, you could then say
    "plot outName smooth unique with yerrorlines"

    Comment lines (starting with '#'), blank lines, lines with too few
    columns, rows with an empty x cell and rows whose y cell is not numeric
    are ignored. Output lines are ordered by the x cells sorted as strings.

    :param int xCol: x column (1-based)
    :param int yCol: y column (1-based)
    :param int numFiles: the number of files to average over; if None, the
                         files in filePath matching prefix and suffix are counted
    :param string filePrefix: prefix in filenames
    :param string fileSuffix: suffix in filenames
    :param string filePath: path to files
    :param string delim: delimiter used between columns
    :param string outName: Name of the result file, defaults to
                           <filePath>/<filePrefix><yCol><fileSuffix>.out
    :raises AssertionError: if filePath does not exist or numFiles, xCol or
                            yCol are zero or not set
    '''
    assert os.path.exists(filePath), 'File path %s does not exist' % filePath
    if numFiles is None:
        numFiles = len([f for f in os.listdir(filePath)
                        if f.endswith(fileSuffix) and f.startswith(filePrefix)])
    assert numFiles, 'numFiles is zero or not set'
    assert xCol, 'xCol is not set'
    assert yCol, 'yCol is not set'

    # ---- First, we'll collect all y values from all files
    # os.path.join also handles filePath='.' correctly (plain concatenation
    # would have produced hidden files like '.data1.csv')
    paths = [os.path.join(filePath, filePrefix + str(i) + fileSuffix)
             for i in range(1, numFiles + 1)]
    values = _collect_y_values(paths, int(xCol) - 1, int(yCol) - 1, delim)

    # ---- Then, we compute mean and standard deviation for the y values for
    #      each x and write them to target file
    if outName is None:
        outName = os.path.join(
            filePath, '%s%s%s.out' % (filePrefix, str(yCol), fileSuffix))
    with open(outName, 'w') as out:
        for x in sorted(values):
            # we write the standard deviation; the standard error would be
            # std / sqrt(n) -- #40 should give a configurable choice
            mean, std = _mean_and_std(values[x])
            out.write('%s %f %f\n' % (str(x), mean, std))