Add an option to read in a list of points to scan from a .csv file #320
@@ -9,6 +9,7 @@
 from CombineHarvester.CombineTools.combine.CombineToolBase import CombineToolBase
 import six
 from six.moves import zip
+import pandas as pd


 def isfloat(value):
@@ -32,6 +33,8 @@ def attach_intercept_args(self, group):
     '-m', '--mass', help='Supports range strings for multiple masses, e.g. "120:130:5,140" will produce four combine calls with mass values of 120, 125, 130 and 140')
 group.add_argument(
     '--points', help='For use with "-M MultiDimFit --algo grid" to split scan points into separate jobs')
+group.add_argument(
+    '--fromfile', help='The file to read the points from, for use with "-M MultiDimFit --algo fixed"')
 group.add_argument(
     '--singlePoint', help='Supports range strings for multiple points to test, uses the same format as the --mass argument')
 group.add_argument(
@@ -88,6 +91,32 @@ def run_method(self):
             subbed_vars[('SEED',)] = [(sval,) for sval in seed_vals]
             self.passthru.extend(['-s', '%(SEED)s'])

+        # Handle the --fromfile option
+        if '--algo' in self.passthru:
Reviewer comment: Conversely, it seems you want to intercept the usual …
+            algo = self.passthru[self.passthru.index('--algo')+1]
+        else:
+            algo = None
+        if self.args.fromfile is not None:
+            if not os.path.exists(self.args.fromfile):
+                print("Points file %s does not exist" % self.args.fromfile)
+                exit(1)
+
+            # Generating points from file is only supported for --algo fixed
+            if algo != 'fixed':
+                print(f'--fromfile is only supported for --algo fixed, not {algo}')
+                exit(1)
+
+            # Check file is a csv file and that the headers are in the model POIs
+            with open(self.args.fromfile) as f:
+                points_df = pd.read_csv(f)
+            if not self.args.points:
+                self.args.points = len(points_df)
Reviewer comment: it would be better not to mix your new implementation with the existing one for …
+
+            if not self.args.split_points:
+                self.args.split_points = len(points_df)
+            if len(points_df) < int(self.args.points):
+                print(f"Points file has fewer points ({len(points_df)}) than requested ({self.args.points})")
+                exit(1)
         for i, generate in enumerate(self.args.generate):
             split_char = ':' if '::' in generate else ';'
             gen_header, gen_content = generate.split(split_char*2)
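The comment in the hunk above says the file headers should be checked against the model POIs, but that check is not visible in the diff itself. A minimal sketch of what it could look like; the points file contents and the poi_names list are assumptions, not taken from the pull request:

import pandas as pd

# Hypothetical inputs: the parsed points file and the model POI names.
# Neither the DataFrame contents nor `poi_names` comes from this diff.
points_df = pd.DataFrame({'r_ggH': [1.0], 'r_qqH': [0.5]})
poi_names = ['r_ggH', 'r_qqH']

# Reject any column that does not correspond to a POI of the model.
unknown = [col for col in points_df.columns if col not in poi_names]
if unknown:
    print("Points file columns %s are not POIs of the model" % unknown)
    exit(1)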
@@ -210,14 +239,16 @@ def run_method(self):
         if put_back_ranges:
             self.put_back_arg('setParameterRanges', '--setParameterRanges')

-        if self.args.points is not None:
+        if self.args.points is not None and algo != 'fixed':
             self.passthru.extend(['--points', self.args.points])
+        groupby = 1
Reviewer comment: move this declaration closer to where it is used for the first time (though I'm not sure it's needed).

         if (self.args.split_points is not None and
                 self.args.split_points > 0 and
-                self.args.points is not None):
+                self.args.points is not None
+                and self.args.fromfile is None):
Reviewer comment: as above, better not to have to modify this code.
             points = int(self.args.points)
             split = self.args.split_points
             start = 0
             ranges = []
             while (start + (split - 1)) < points:
                 # filename = "higgsCombine"+self.args.name+".POINTS."+str(start)+"."+str(start+(split-1))+".MultiDimFit.mH"+str(self.args.mass)+".root"
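For reference, the pre-existing splitting loop above turns self.args.points / self.args.split_points into (first, last) index ranges. Its body is not shown in this hunk, so the sketch below reconstructs the usual pattern with purely illustrative numbers:

# Standalone illustration of the range-splitting loop; the loop body is
# reconstructed here and the values (100 points, chunks of 30) are examples only.
points, split = 100, 30
start, ranges = 0, []
while (start + (split - 1)) < points:
    ranges.append((start, start + (split - 1)))
    start += split
if start < points:
    ranges.append((start, points - 1))
print(ranges)  # [(0, 29), (30, 59), (60, 89), (90, 99)]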
@@ -238,18 +269,48 @@ def run_method(self):
                 ['--firstPoint %(P_START)s --lastPoint %(P_END)s'])
             self.args.name += '.POINTS.%(P_START)s.%(P_END)s'

+        # Handle --fromfile option
+        if (self.args.split_points is not None and
+                self.args.split_points > 0 and
+                self.args.points is not None
+                and self.args.fromfile is not None):
+            points = int(self.args.points)
+            start = 0
+            groupby = self.args.split_points
+            ranges = []
+            while (start) < points:
+                ranges.append((start, start))
+                start += 1
+            if start < points:
+                ranges.append((start, points - 1))
+            points_array = []
+            for r in ranges:
+                points_dict = points_df.iloc[r[0]].to_dict()
+                points_str = ','.join(f'{k}={v}' for k, v in points_dict.items())
+                points_array.append((r[0], r[1], points_str))
+            subbed_vars[('P_START', 'P_END', 'fixedPointPOIs')] = points_array
Reviewer comment: Why do you need to track start and end indexes here? If I understand correctly, you just want to create one combine call for each row of your CSV file. In that case, you only need one index (to set in …
+
+            self.passthru.extend(
+                ['--fixedPointPOIs %(fixedPointPOIs)s'])
+            self.args.name += '.POINTS.%(P_START)s.%(P_END)s'

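Following the reviewer's suggestion, a sketch of how one combine call per CSV row could be built with a single index instead of (start, end) ranges; the P_INDEX key and the example DataFrame are hypothetical, not code from this pull request:

import pandas as pd

# Hypothetical input, mirroring the CSV read earlier in the diff.
points_df = pd.DataFrame({'r_ggH': [0.8, 1.0], 'r_qqH': [0.5, 1.5]})

# One combine call per CSV row: a single point index plus the POI string.
points_array = []
for idx, row in points_df.iterrows():
    points_str = ','.join('%s=%s' % (k, v) for k, v in row.items())
    points_array.append((idx, points_str))

print(points_array)  # [(0, 'r_ggH=0.8,r_qqH=0.5'), (1, 'r_ggH=1.0,r_qqH=1.5')]
# which could then be registered as, e.g.,
#   subbed_vars[('P_INDEX', 'fixedPointPOIs')] = points_array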
         # can only put the name option back now because we might have modified
         # it from what the user specified
         self.put_back_arg('name', '-n')
         proto = 'combine ' + (' '.join(self.passthru))
         if self.args.there:
             proto = 'pushd %(DIR)s; combine ' + (' '.join(self.passthru))+'; popd'

-        for it in itertools.product(*list(subbed_vars.values())):
+        job_str = ''
+        for idx, it in enumerate(itertools.product(*list(subbed_vars.values()))):
Reviewer comment: Please avoid changing this part of the code, I don't think it's necessary - combineTool.py already has a mechanism to group individual combine calls into single jobs (…
             keys = list(subbed_vars.keys())
             dict = {}
             for i, k in enumerate(keys):
                 for tuple_i, tuple_ele in enumerate(k):
                     dict[tuple_ele] = it[i][tuple_i]
-            self.job_queue.append(proto % dict)
+            job_str += proto % dict + '\n '
+            if ((idx + 1) % groupby) == 0:
+                self.job_queue.append(job_str.strip())
+                job_str = ''
+        if job_str != '':
+            self.job_queue.append(job_str.strip())
         self.flush_queue()
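The job_str/groupby changes above batch every groupby consecutive combine calls into a single queue entry. A standalone sketch of that grouping pattern with toy commands (values chosen only for illustration):

# Group consecutive command strings into jobs of size `groupby`,
# flushing any remainder at the end - the same pattern as in the diff above.
commands = ['combine -M MultiDimFit --fixedPointPOIs r=%d' % i for i in range(5)]
groupby = 2
job_queue, job_str = [], ''
for idx, cmd in enumerate(commands):
    job_str += cmd + '\n '
    if (idx + 1) % groupby == 0:
        job_queue.append(job_str.strip())
        job_str = ''
if job_str:
    job_queue.append(job_str.strip())
print(len(job_queue))  # 3: two jobs of 2 commands and one job of 1 command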
Reviewer comment: This function describes what we call intercept args - ones that are already used by combine that we want to extract/manipulate. Since you are adding a new arg, this should go in the attach_args method below.
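Per the comment above, a rough sketch of registering the option via attach_args instead of attach_intercept_args; the free-standing function and argparse wiring here are stand-ins for the real method on the tool class, so treat the signature as an assumption:

import argparse

# Hypothetical stand-in for the attach_args method the reviewer points to;
# new, tool-only options would be registered here rather than among the
# intercepted combine options.
def attach_args(group):
    group.add_argument(
        '--fromfile',
        help='The file to read the points from, for use with "-M MultiDimFit --algo fixed"')

parser = argparse.ArgumentParser()
attach_args(parser)
print(parser.parse_args(['--fromfile', 'points.csv']).fromfile)  # points.csv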