From c5292f8342469c731854c81005cf235ed1288222 Mon Sep 17 00:00:00 2001
From: jkuhl-uni
Date: Fri, 17 Dec 2021 15:16:17 +0100
Subject: [PATCH] implemented idl into sfcf-read method

---
 pyerrors/input/sfcf.py  | 255 +++++++++++++++-------------------
 pyerrors/input/utils.py |  17 ++-
 2 files changed, 105 insertions(+), 167 deletions(-)

diff --git a/pyerrors/input/sfcf.py b/pyerrors/input/sfcf.py
index 5915c56e..8ba9a3da 100644
--- a/pyerrors/input/sfcf.py
+++ b/pyerrors/input/sfcf.py
@@ -6,125 +6,41 @@ import fnmatch
 import re
 import numpy as np  # Thinly-wrapped numpy
 from ..obs import Obs
-
-
-def read_sfcf_old(path, prefix, name, quarks, noffset = 0, wf=0, wf2=0, **kwargs):
-    """Read sfcf format (from around 2012) from given folder structure.
-
-    Keyword arguments
-    -----------------
-    im -- if True, read imaginary instead of real part of the correlation function.
-    single -- if True, read a boundary-to-boundary correlation function with a single value
-    b2b -- if True, read a time-dependent boundary-to-boundary correlation function
-    names -- Alternative labeling for replicas/ensembles. Has to have the appropriate length
-    """
-    if kwargs.get('im'):
-        im = 1
-        part = 'imaginary'
-    else:
-        im = 0
-        part = 'real'
-
-    b2b = 0
-
-    if kwargs.get('b2b'):
-        b2b = 1
-
-    quarks = quarks.split(" ")
-    read = 0
-    T = 0
-    start = 0
-    ls = []
-    for (dirpath, dirnames, filenames) in os.walk(path):
-        ls.extend(dirnames)
-        break
-    if not ls:
-        print('Error, directory not found')
-        #sys.exit()
-    for exc in ls:
-        if fnmatch.fnmatch(exc, prefix + '*'):
-            ls = list(set(ls) - set(exc))
-    if len(ls) > 1:
-        ls.sort(key=lambda x: int(re.findall(r'\d+', x[len(prefix):])[0]))
-    replica = len(ls)
-    print('Read', part, 'part of', name, 'from', prefix, ',', replica, 'replica')
-    if 'names' in kwargs:
-        new_names = kwargs.get('names')
-        if len(new_names) != replica:
-            raise Exception('Names does not have the required length', replica)
-    else:
-        new_names = ls
-    print(replica, 'replica')
-    for i, item in enumerate(ls):
-        print(item)
-        sub_ls = []
-        for (dirpath, dirnames, filenames) in os.walk(path+'/'+item):
-            sub_ls.extend(dirnames)
-            break
-        for exc in sub_ls:
-            if fnmatch.fnmatch(exc, 'cfg*'):
-                sub_ls = list(set(sub_ls) - set(exc))
-        sub_ls.sort(key=lambda x: int(x[3:]))
-        no_cfg = len(sub_ls)
-        print(no_cfg, 'configurations')
-        if i == 0:
-            with open(path + '/' + item + '/' + sub_ls[0] + '/' + name) as fp:
-                for k, line in enumerate(fp):
-                    #check if this is really the right file
-                    pattern = "# "+name+" : offset "+str(noffset)+", wf "+"0"
-                    #if b2b, a second wf is needed
-                    if b2b:
-                        pattern+=", wf_2 "+"0"
-                    pattern+=" : "+quarks[0]+" - "+quarks[1]
-
-                    if read == 1 and not line.strip() and k > start + 1:
-                        break
-                    if read == 1 and k >= start:
-                        T += 1
-                    if pattern in line:
-                        #print(line)
-                        read = 1
-                        start = k+1
-            print(str(T)+" entries found.")
-
-            deltas = []
-            for j in range(T):
-                deltas.append([])
-
-            sublength = len(sub_ls)
-            for j in range(T):
-                deltas[j].append(np.zeros(sublength))
-
-        for cnfg, subitem in enumerate(sub_ls):
-            with open(path + '/' + item + '/' + subitem + '/'+name) as fp:
-                for k, line in enumerate(fp):
-                    if(k >= start and k < start + T):
-                        floats = list(map(float, line.split()))
-                        deltas[k-start][i][cnfg] = floats[im]
-
-
-    result = []
-    for t in range(T):
-        result.append(Obs(deltas[t], new_names))
-
-    return result
-
+from . import utils


 def read_sfcf(path, prefix, name, quarks='.*', noffset=0, wf=0, wf2=0, **kwargs):
     """Read sfcf c format from given folder structure.
     Parameters
     ----------
-    quarks -- Label of the quarks used in the sfcf input file
-    noffset -- Offset of the source (only relevant when wavefunctions are used)
-    wf -- ID of wave function
-    wf2 -- ID of the second wavefunction (only relevant for boundary-to-boundary correlation functions)
-    im -- if True, read imaginary instead of real part of the correlation function.
-    b2b -- if True, read a time-dependent boundary-to-boundary correlation function
-    single -- if True, read time independent boundary to boundary correlation function
-    names -- Alternative labeling for replicas/ensembles. Has to have the appropriate length
+    quarks: str
+        Label of the quarks used in the sfcf input file, e.g. "quark quark".
+        For version 0.0 this does NOT need to be given with the typical " - " that is present in the output file;
+        this is done automatically for this version.
+    noffset: int
+        Offset of the source (only relevant when wavefunctions are used)
+    wf: int
+        ID of wave function
+    wf2: int
+        ID of the second wavefunction (only relevant for boundary-to-boundary correlation functions)
+    im: bool
+        if True, read imaginary instead of real part of the correlation function.
+    b2b: bool
+        if True, read a time-dependent boundary-to-boundary correlation function
+    single: bool
+        if True, read a time-independent boundary-to-boundary correlation function
+    names: list
+        Alternative labeling for replicas/ensembles. Has to have the appropriate length
     ens_name : str
         replaces the name of the ensemble
+    version: str
+        Version of SFCF with which the measurement was done. If the compact output option (-c) was specified, append a 'c' to the version (e.g. "1.0c").
+    replica: list
+        List of replica to be read, default is all.
+    files: list
+        List of files to be read per replicum, default is all. For the non-compact output format, hand the folders to be read here.
+    check_configs: list
+        List of lists of expected configs, e.g. [range(1, 1000)] for one replicum with 1000 configs.
     """
     if kwargs.get('im'):
         im = 1
@@ -142,8 +58,8 @@ def read_sfcf(path, prefix, name, quarks='.*', noffset=0, wf=0, wf2=0, **kwargs)
     else:
         b2b = 0
         single = 0
-
-    files = []
+    if "replica" in kwargs:
+        reps = kwargs.get("replica")

     if "files" in kwargs:
         files = kwargs.get("files")
@@ -172,8 +88,8 @@ def read_sfcf(path, prefix, name, quarks='.*', noffset=0, wf=0, wf2=0, **kwargs)
     if not ls:
         raise Exception('Error, directory not found')
     # Exclude folders with different names
-    if len(files) != 0:
-        ls = files
+    if "replica" in kwargs:
+        ls = reps
     else:
         for exc in ls:
             if not fnmatch.fnmatch(exc, prefix + '*'):
@@ -182,9 +98,11 @@ def read_sfcf(path, prefix, name, quarks='.*', noffset=0, wf=0, wf2=0, **kwargs)
             ls.sort(key=lambda x: int(re.findall(r'\d+', x[len(prefix):])[0]))
     # New version, to cope with ids, etc.
     replica = len(ls)
     print('Read', part, 'part of', name, 'from', prefix[:-1], ',', replica, 'replica')
-
+    idl = []
     if 'names' in kwargs:
         new_names = kwargs.get('names')
+        if len(new_names) != len(set(new_names)):
+            raise Exception("names are not unique!")
         if len(new_names) != replica:
             raise Exception('Names does not have the required length', replica)
     else:
@@ -194,59 +112,65 @@ def read_sfcf(path, prefix, name, quarks='.*', noffset=0, wf=0, wf2=0, **kwargs)
             try:
                 idx = entry.index('r')
             except:
-                idx = len(entry)-2
+                raise Exception("Automatic recognition of replicum failed, please enter the keyword 'names'.")
+
             if 'ens_name' in kwargs:
                 new_names.append(kwargs.get('ens_name') + '|' + entry[idx:])
             else:
                 new_names.append(entry[:idx] + '|' + entry[idx:])
     for i, item in enumerate(ls):
         sub_ls = []
-        for (dirpath, dirnames, filenames) in os.walk(path + '/' + item):
-            if compact:
-                sub_ls.extend(filenames)
-            else:
-                sub_ls.extend(dirnames)
-            break
-
-        #print(sub_ls)
-        for exc in sub_ls:
-            if compact:
-                if not fnmatch.fnmatch(exc, prefix + '*'):
-                    sub_ls = list(set(sub_ls) - set([exc]))
-                sub_ls.sort(key=lambda x: int(re.findall(r'\d+', x)[-1]))
-            else:
-                if not fnmatch.fnmatch(exc, 'cfg*'):
-                    sub_ls = list(set(sub_ls) - set([exc]))
-                sub_ls.sort(key=lambda x: int(x[3:]))
-
-        if compact:
-            first_cfg = int(re.findall(r'\d+', sub_ls[0])[-1])
-
-            last_cfg = len(sub_ls) + first_cfg - 1
-
-            for cfg in range(1, len(sub_ls)):
-                if int(re.findall(r'\d+', sub_ls[cfg])[-1]) != first_cfg + cfg:
-                    last_cfg = cfg + first_cfg - 1
-                    break
-
-            no_cfg = last_cfg - first_cfg + 1
-            print(item, ':', no_cfg, 'evenly spaced configurations (', first_cfg, '-', last_cfg, ') ,', len(sub_ls) - no_cfg, 'configs omitted\n')
+        if "files" in kwargs:
+            sub_ls = kwargs.get("files")
+            sub_ls.sort(key=lambda x: int(re.findall(r'\d+', x)[-1]))
         else:
-            no_cfg = len(sub_ls)
-            print(no_cfg, 'configurations')
-
-        #here we have found all the files we need to look into.
+            for (dirpath, dirnames, filenames) in os.walk(path + '/' + item):
+                if compact:
+                    sub_ls.extend(filenames)
+                else:
+                    sub_ls.extend(dirnames)
+                break
+
+            # print(sub_ls)
+            for exc in sub_ls:
+                if compact:
+                    if not fnmatch.fnmatch(exc, prefix + '*'):
+                        sub_ls = list(set(sub_ls) - set([exc]))
+                    sub_ls.sort(key=lambda x: int(re.findall(r'\d+', x)[-1]))
+                else:
+                    if not fnmatch.fnmatch(exc, 'cfg*'):
+                        sub_ls = list(set(sub_ls) - set([exc]))
+                    sub_ls.sort(key=lambda x: int(x[3:]))
+        # print(sub_ls)
+        rep_idl = []
+        no_cfg = len(sub_ls)
+        for cfg in sub_ls:
+            try:
+                if compact:
+                    rep_idl.append(int(cfg.split("n")[-1]))
+                else:
+                    rep_idl.append(int(cfg[3:]))
+            except Exception:
+                raise Exception("Couldn't parse idl from directory, problem with file " + cfg)
+        rep_idl.sort()
+        # maybe there is a better way to print the idls
+        print(item, ':', no_cfg, 'configurations')
+        idl.append(rep_idl)
+        # here we have found all the files we need to look into.
         if i == 0:
+            # here we want to find the place within the file where the correlator we need is stored.
+
             if compact:
-
+                # to do so, the pattern needed is put together from the input values
                 pattern = 'name      ' + name + '\nquarks    ' + quarks + '\noffset    ' + str(noffset) + '\nwf        ' + str(wf)
                 if b2b:
                     pattern += '\nwf_2      ' + str(wf2)
-
+                # and the file is parsed through to find the pattern
                 with open(path + '/' + item + '/' + sub_ls[0], 'r') as file:
                     content = file.read()
                     match = re.search(pattern, content)
                     if match:
+                        # the start and end point of the correlator in question are extracted for later use in the other files
                         start_read = content.count('\n', 0, match.start()) + 5 + b2b
                         end_match = re.search(r'\n\s*\n', content[match.start():])
                         T = content[match.start():].count('\n', 0, end_match.start()) - 4 - b2b
@@ -255,11 +179,11 @@ def read_sfcf(path, prefix, name, quarks='.*', noffset=0, wf=0, wf2=0, **kwargs)
                     else:
                         raise Exception('Correlator with pattern\n' + pattern + '\nnot found.')
             else:
-                #print(path + '/' + item + '/')# + sub_ls[0] + '/' + name)
+                # this part does the same as above, but for non-compactified versions of the files
                 with open(path + '/' + item + '/' + sub_ls[0] + '/' + name) as fp:
                     for k, line in enumerate(fp):
                         if version == "0.0":
-                            #check if this is really the right file
+                            # check if this is really the right file by matching a pattern similar to above
                             pattern = "# "+name+" : offset "+str(noffset)+", wf "+str(wf)
                             #if b2b, a second wf is needed
                             if b2b:
@@ -284,19 +208,24 @@ def read_sfcf(path, prefix, name, quarks='.*', noffset=0, wf=0, wf2=0, **kwargs)
                 T -= b2b
             print(str(T)+" entries found.")
             #we found where the correlator that is to be read is in the files
+            # after preparing the data structure the correlators get parsed into ...
             deltas = []
             for j in range(T):
                 deltas.append([])
-
+
         sublength = no_cfg
         for j in range(T):
            deltas[j].append(np.zeros(sublength))
+        # ... the actual parsing can start. We iterate through all measurement files in the path given ...
         if compact:
             for cfg in range(no_cfg):
                 with open(path + '/' + item + '/' + sub_ls[cfg]) as fp:
                     lines = fp.readlines()
+                    # check if the correlator is in fact printed completely
                     if(start_read + T>len(lines)):
                         raise Exception("EOF before end of correlator data! Maybe "+path + '/' + item + '/' + sub_ls[cfg]+" is corrupted?")
+                    # and start to read the correlator.
+                    # the range here is chosen like this, since it allows for implementing a security check for every correlator that is read later on
                     for k in range(start_read - 6,start_read + T):
                         if k == start_read - 5 - b2b:
                             if lines[k].strip() != 'name ' + name:
@@ -307,6 +236,8 @@ def read_sfcf(path, prefix, name, quarks='.*', noffset=0, wf=0, wf2=0, **kwargs)
         else:
             for cnfg, subitem in enumerate(sub_ls):
                 with open(path + '/' + item + '/' + subitem + '/' + name) as fp:
+                    # since the non-compactified files are typically not so long, we can iterate over the whole file.
+                    # here one can also implement the check from above.
                     for k, line in enumerate(fp):
                         if(k >= start and k < start + T):
                             floats = list(map(float, line.split()))
@@ -315,9 +246,17 @@ def read_sfcf(path, prefix, name, quarks='.*', noffset=0, wf=0, wf2=0, **kwargs)
                             else:
                                 deltas[k - start][i][cnfg] = floats[1 + im - single]
-
+    if "check_configs" in kwargs:
+        print("Checking for missing configs...")
+        che = kwargs.get("check_configs")
+        if len(che) != len(idl):
+            raise Exception("check_configs has to have the same length as the number of replica!")
+        for r in range(len(idl)):
+            print("checking " + new_names[r])
+            utils.check_idl(idl[r], che[r])
+        print("Done")

     result = []
     for t in range(T):
-        result.append(Obs(deltas[t], new_names))
+        result.append(Obs(deltas[t], new_names, idl=idl))

     return result
diff --git a/pyerrors/input/utils.py b/pyerrors/input/utils.py
index f4264587..a8dd026e 100644
--- a/pyerrors/input/utils.py
+++ b/pyerrors/input/utils.py
@@ -1,14 +1,13 @@
-import fnmatch
+"""Utilities for the input"""

-def check_missing(idl,che):
+def check_idl(idl, che):
     missing = []
-    for ind in che:
-        if not ind in idl:
-            missing.append(ind)
-    if(len(missing) == 0):
-        print("There are no measurements missing.")
-    else:
-        print(len(missing),"measurements missing")
+    for c in che:
+        if c not in idl:
+            missing.append(c)
+    # print the missing configs such that the list can be directly passed on to slurm
+    if len(missing) != 0:
+        print(len(missing), "configs missing")
         miss_str = str(missing[0])
         for i in missing[1:]:
             miss_str += ","+str(i)
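
A minimal usage sketch of the extended reader follows; the directory, prefix, correlator name and ensemble labels are hypothetical placeholders and not part of this patch:

    import pyerrors.input.sfcf as sfcf

    # read the real part of "f_A" from a compact-format (SFCF 1.0, -c) measurement,
    # label the two replica explicitly and check the parsed idl lists for gaps
    corr = sfcf.read_sfcf("/data/my_measurement", "my_run", "f_A",
                          quarks="lquark lquark", noffset=0, wf=0,
                          version="1.0c", names=["A654|r1", "A654|r2"],
                          check_configs=[range(1, 501), range(1, 501)])
    print(len(corr), "timeslices read")

With check_configs given, the reader calls utils.check_idl for every replicum and reports how many of the expected configurations are missing from the parsed idl.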