Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add BDT to jobs #253

Merged
merged 35 commits into main from addbdt-to-jobs
Jan 31, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
d977065
temporary fix for btagging
cmantill Oct 15, 2024
36abe29
add more variables
cmantill Oct 16, 2024
40ac1fc
add warning when sample is empty
cmantill Oct 16, 2024
142fdbf
merge
cmantill Oct 16, 2024
0622a8e
add script for jmsr templates
cmantill Oct 16, 2024
3a76d99
update
cmantill Oct 16, 2024
7cd8c40
style: pre-commit fixes
pre-commit-ci[bot] Oct 16, 2024
2011389
add variations
cmantill Oct 16, 2024
3074137
fix conflict
cmantill Oct 16, 2024
e600dd4
style: pre-commit fixes
pre-commit-ci[bot] Oct 16, 2024
c0f597b
Merge branch 'main' of github.com:LPC-HH/HH4b
cmantill Oct 16, 2024
6541920
Merge branch 'main' into cmantill-dev
cmantill Oct 16, 2024
bd8b826
merge
cmantill Oct 16, 2024
cb13334
Merge branch 'main' of github.com:LPC-HH/HH4b
cmantill Oct 28, 2024
c31263f
plot bdt curves against TT
cmantill Oct 31, 2024
c0ab0f9
test bdt inference
cmantill Nov 3, 2024
4632a5d
merge
cmantill Nov 3, 2024
42fa207
merge
cmantill Jan 20, 2025
050353d
merge
cmantill Jan 20, 2025
60e47a7
add debug
cmantill Jan 20, 2025
2b9e216
add exact version of xgboost
cmantill Jan 20, 2025
a96938a
revert back to >= xgboost
cmantill Jan 20, 2025
7eb8b40
add debug
cmantill Jan 20, 2025
460210f
add exact version again
cmantill Jan 20, 2025
4fd5b2a
add more colors
cmantill Jan 27, 2025
a1e169a
clean printouts
cmantill Jan 31, 2025
8eeb81a
style: pre-commit fixes
pre-commit-ci[bot] Jan 31, 2025
f42d4fa
Merge branch 'main' into addbdt-to-jobs
cmantill Jan 31, 2025
b40e13e
fix syntax
cmantill Jan 31, 2025
0189c6e
fix syntax
cmantill Jan 31, 2025
a2bddaa
change bdt for 24Nov7_v5_glopartv2_rawmass version
cmantill Jan 31, 2025
803a41e
style: pre-commit fixes
pre-commit-ci[bot] Jan 31, 2025
da11217
fix key vars
cmantill Jan 31, 2025
4154d17
add bdt variations
cmantill Jan 31, 2025
cda0cdb
style: pre-commit fixes
pre-commit-ci[bot] Jan 31, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,5 @@ tabulate
tqdm==4.65.0
uproot==4.3.7
vector==1.1.0
xgboost>=2.0.3
xgboost==2.0.3
xxhash
40 changes: 25 additions & 15 deletions src/HH4b/boosted/ValidateAK8Tagging.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,12 @@
logger = logging.getLogger("TrainBDT")


use_ttbar = False


def load_events(
path_to_dir, year, jet_collection, pt_cut, msd_cut, jet_coll_pnet, match_higgs, num_jets
):
add_ttbar = False

sample_dirs = {
year: {
"qcd": [
Expand All @@ -42,8 +43,9 @@ def load_events(
],
}
}
if add_ttbar:
sample_dirs[year]["ttbar"] = ["TTTo4Q"]
if use_ttbar:
sample_dirs = {year: {"ttbar": ["TTto4Q"]}}

sample_dirs_sig = {
year: {
"hh4b": [
Expand Down Expand Up @@ -154,7 +156,8 @@ def get_roc_inputs(
jet_index,
):
sig_key = "hh4b"
bg_keys = ["qcd"]
bg_keys = ["ttbar"] if use_ttbar else ["qcd"]

discriminator = f"{jet_collection}{discriminator_name}"

# 1 for signal, 0 for background
Expand Down Expand Up @@ -219,11 +222,16 @@ def get_roc(
scores = np.concatenate(scores_arr)
weights = np.concatenate(weights_arr)
fpr, tpr, thresholds = roc_curve(y_true, scores, sample_weight=weights)
auc_label = ""
try:
auc_label = f" AUC ({auc(fpr, tpr):.4f})"
except ValueError:
print("AUC invalid")
roc = {
"fpr": fpr,
"tpr": tpr,
"thresholds": thresholds,
"label": discriminator_label + f" AUC ({auc(fpr, tpr):.4f})",
"label": discriminator_label + auc_label,
"color": discriminator_color,
}

Expand All @@ -245,7 +253,7 @@ def main(args):
# w/o trigger selection
tag = "24Sep27_v12v2_private_pre-sel"
year = "2022"
outdir = "24Sep27" # date of plotting
outdir = "24Dec13" # date of plotting
plot_dir = f"/uscms/home/cmantill/nobackup/hh/HH4b/plots/PostProcessing/{outdir}/{year}"
_ = os.system(f"mkdir -p {plot_dir}")
path_to_dir = f"{MAIN_DIR}/{tag}/"
Expand Down Expand Up @@ -279,7 +287,7 @@ def main(args):
events_dict,
jet_collection,
"PNetTXbbLegacy",
"ParticleNet Legacy Hbb vs QCD",
"ParticleNet Legacy TXbb",
"blue",
jet_indices,
pt_cut,
Expand All @@ -290,7 +298,7 @@ def main(args):
events_dict,
jet_collection,
"PNetTXbb",
"ParticleNet 103X Hbb vs QCD",
"ParticleNet 103X TXbb",
"orange",
jet_indices,
pt_cut,
Expand All @@ -301,7 +309,7 @@ def main(args):
events_dict,
jet_collection,
"ParTTXbb",
"GloParTv2 Hbb vs QCD",
"GloParTv2 TXbb",
"red",
jet_indices,
pt_cut,
Expand All @@ -311,9 +319,10 @@ def main(args):
}
# thresholds on the discriminator, used to search for signal efficiency
plot_thresholds = {
# "PNetTXbbLegacy": [0.8, 0.92, 0.975],
"PNetTXbbLegacy": [0.8],
# "PNetTXbb": [0.7],
# "ParTTXbb": [0.38],
"PNetTXbb": [0.7],
"ParTTXbb": [0.3, 0.75, 0.78, 0.9375],
}
# find what the threshold should be to achieve this signal efficiency
find_from_sigeff = {
Expand All @@ -322,18 +331,19 @@ def main(args):
# "PNetTXbb": [0.72],
# "ParTTXbb": [0.72],
}
bkg_label = "TT" if use_ttbar else "QCD"
plotting.multiROCCurveGrey(
{"bb": rocs},
# sig_effs=[0.6],
sig_effs=[],
bkg_effs=[0.01],
bkg_effs=[],
xlim=[0, 1.0],
ylim=[1e-4, 1],
show=True,
plot_dir=Path(plot_dir),
name=f"{jet_collection}{jet_coll_pnet}ROC{''.join(str(x) for x in jet_indices)}_{cut_str}",
name=f"{jet_collection}{jet_coll_pnet}{bkg_label}ROC{''.join(str(x) for x in jet_indices)}_{cut_str}",
title=(
f"AK8 Jets {jet_indices}"
f"AK8 Jets {jet_indices}, {bkg_label}"
if jet_collection == "ak8FatJet"
else f"bb Jets {jet_indices}"
),
Expand Down
34 changes: 15 additions & 19 deletions src/HH4b/boosted/ValidateBDT.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import auc, roc_curve
from sklearn.metrics import roc_curve

import HH4b.utils as utils
from HH4b import hh_vars
Expand Down Expand Up @@ -230,15 +230,10 @@ def get_bdt(events_dict, bdt_model, bdt_model_name, bdt_config, jlabel=""):

def get_roc_inputs(
events_dict,
# jet_collection,
discriminator_name,
# jet_index,
bkgs,
bg_keys,
):
sig_key = "hh4b"

# bg_keys = ["ttbar"]
bg_keys = bkgs
discriminator = f"{discriminator_name}"
print(events_dict["ttbar"])
# 1 for signal, 0 for background
Expand All @@ -253,6 +248,7 @@ def get_roc_inputs(
[events_dict[sig_key]["finalWeight"]]
+ [events_dict[bg_key]["finalWeight"] for bg_key in bg_keys],
)

# discriminator
# print(events_dict[sig_key][discriminator])
scores = np.concatenate(
Expand All @@ -269,10 +265,11 @@ def get_roc(
discriminator_name,
discriminator_label,
discriminator_color,
bkgs,
bg_keys,
):
y_true, scores, weights = get_roc_inputs(events_dict, discriminator_name, bkgs)
y_true, scores, weights = get_roc_inputs(events_dict, discriminator_name, bg_keys)
fpr, tpr, thresholds = roc_curve(y_true, scores, sample_weight=weights)

# make sure fpr is sorted
sorted_indices = np.argsort(fpr)
fpr = fpr[sorted_indices]
Expand All @@ -282,7 +279,7 @@ def get_roc(
"fpr": fpr,
"tpr": tpr,
"thresholds": thresholds,
"label": discriminator_label + f" AUC ({auc(fpr, tpr):.4f})",
"label": discriminator_label, # + f" AUC ({auc(fpr, tpr):.4f})",
"color": discriminator_color,
}

Expand Down Expand Up @@ -390,7 +387,7 @@ def main(args):
)
for year in args.year
}
processes = ["qcd", "ttbar", "hh4b"]
processes = ["hh4b"] + bkgs
bdt_dict_combined = {
key: pd.concat([bdt_dict[year][key] for year in bdt_dict]) for key in processes
}
Expand All @@ -407,23 +404,22 @@ def main(args):
for i, bdt_model in enumerate(bdt_models):

rocs[bdt_model] = get_roc(
bdt_dict_combined,
f"bdtscore_{bdt_model}",
bdt_model,
colors[i],
bkgs,
bdt_dict_combined, f"bdtscore_{bdt_model}", bdt_model, colors[i], bg_keys=bkgs
)

# Plot multi-ROC curve

bkgprocess_key = "-".join(args.processes)
years_key = "_".join(args.year)
output_name = f"PNet-parT-comparison-{bkgprocess_key}-{years_key}"
print(output_name)
multiROCCurveGrey(
restructure_rocs(rocs),
# sig_effs=sig_effs,
# bkg_effs=bkg_effs,
plot_dir=out_dir,
legtitle=get_legtitle("bbFatJetParTTXbb"),
title="ggF HH4b BDT ROC",
name="PNet-parT-comparison",
name=output_name,
plot_thresholds={
"v5_PNetLegacy": [0.98, 0.88, 0.03],
"v5_ParT_rawmass": [0.91, 0.64, 0.03],
Expand All @@ -444,7 +440,7 @@ def main(args):
type=str,
default=["2022EE"],
choices=hh_vars.years,
help="years to train on",
help="years to evaluate on",
)
parser.add_argument(
"--data-path",
Expand Down
11 changes: 10 additions & 1 deletion src/HH4b/plotting.py
Original file line number Diff line number Diff line change
Expand Up @@ -1201,7 +1201,16 @@ def multiROCCurveGrey(
if xlim is None:
xlim = [0, 1]
line_style = {"colors": "lightgrey", "linestyles": "dashed"}
th_colours = ["cornflowerblue", "deepskyblue", "mediumblue", "cyan", "cadetblue"]
th_colours = [
"cornflowerblue",
"deepskyblue",
"mediumblue",
"cyan",
"cadetblue",
"plum",
"purple",
"palevioletred",
]
eff_colours = ["lime", "aquamarine", "greenyellow"]

fig = plt.figure(figsize=(12, 12))
Expand Down
58 changes: 33 additions & 25 deletions src/HH4b/processors/bbbbSkimmer.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ def __init__(
save_systematics=False,
region="signal",
nano_version="v12",
txbb="pnet-legacy",
txbb="glopart-v2",
):
super().__init__()

Expand Down Expand Up @@ -379,7 +379,7 @@ def __init__(
self._accumulator = processor.dict_accumulator({})

# BDT model
bdt_model_name = "24May31_lr_0p02_md_8_AK4Away"
bdt_model_name = "24Nov7_v5_glopartv2_rawmass"
self.bdt_model = xgb.XGBClassifier()
self.bdt_model.load_model(
fname=f"{package_path}/boosted/bdt_trainings_run3/{bdt_model_name}/trained_bdt.model"
Expand Down Expand Up @@ -807,12 +807,14 @@ def process(self, events: ak.Array):
}

if self._region == "signal":
bdtVars = self.getBDT(bbFatJetVars, vbfJetVars, ak4JetAwayVars, met_pt, "")
print(bdtVars)
skimmed_events = {
**skimmed_events,
**bdtVars,
}
for jshift in ["", "JMS_down", "JMS_up", "JMR_down", "JMR_up"] + list(
self.jecs.values()
):
bdtVars = self.getBDT(bbFatJetVars, vbfJetVars, ak4JetAwayVars, met_pt, jshift)
skimmed_events = {
**skimmed_events,
**bdtVars,
}

if self._region == "semilep-tt":
# concatenate leptons
Expand Down Expand Up @@ -891,7 +893,9 @@ def process(self, events: ak.Array):
# >=1 bb AK8 jets (ordered by TXbb) with TXbb > 0.8
cut_txbb = (
np.sum(
bbFatJetVars[f"bbFatJet{txbb_str}"] >= self.preselection[self.txbb],
(bbFatJetVars[f"bbFatJet{txbb_str}"] >= self.preselection[self.txbb])
| (bbFatJetVars["bbFatJetPNetTXbbLegacy"] >= self.preselection[self.txbb])
| (bbFatJetVars["bbFatJetParTTXbb"] >= self.preselection[self.txbb]),
axis=1,
)
>= 1
Expand Down Expand Up @@ -956,8 +960,10 @@ def process(self, events: ak.Array):
add_selection("ak8_pt_msd", cut_pt_msd, *selection_args)

# == 2 AK8 jets with Xbb>0.1
cut_txbb = (np.sum(ak8FatJetVars["ak8FatJetPNetTXbb"] >= 0.1, axis=1) == 2) | (
np.sum(ak8FatJetVars["ak8FatJetParTTXbb"] >= 0.05, axis=1) == 2
cut_txbb = (
(np.sum(ak8FatJetVars["ak8FatJetPNetTXbb"] >= 0.1, axis=1) == 2)
| (np.sum(ak8FatJetVars["ak8FatJetParTTXbb"] >= 0.05, axis=1) == 2)
| (np.sum(ak8FatJetVars["ak8FatJetPNetTXbbLegacy"] >= 0.1, axis=1) == 2)
)
add_selection("ak8bb_txbb", cut_txbb, *selection_args)

Expand Down Expand Up @@ -1093,35 +1099,37 @@ def getBDT(
"""Calculates BDT"""
key_map = get_var_mapping(jshift)

# makedataframe from 24May31_lr_0p02_md_8_AK4Away
# makedataframe from v5_glopartv2
# 24Nov7_v5_glopartv2_rawmass
# NOTE: this bdt assumes mass = raw mass
jets = vector.array(
{
"pt": bbFatJetVars["bbFatJetPt"],
"phi": bbFatJetVars["bbFatJetPhi"],
"eta": bbFatJetVars["bbFatJetEta"],
"M": bbFatJetVars["bbFatJetPNetMassLegacy"],
"pt": bbFatJetVars[key_map("bbFatJetPt")],
"phi": bbFatJetVars[key_map("bbFatJetPhi")],
"eta": bbFatJetVars[key_map("bbFatJetEta")],
"M": bbFatJetVars[key_map("bbFatJetParTmassVis")],
}
)
h1 = jets[:, 0]
h2 = jets[:, 1]
hh = jets[:, 0] + jets[:, 1]
vbfjets = vector.array(
{
"pt": vbfJetVars["VBFJetPt"],
"phi": vbfJetVars["VBFJetPhi"],
"eta": vbfJetVars["VBFJetEta"],
"M": vbfJetVars["VBFJetMass"],
"pt": vbfJetVars[key_map("VBFJetPt")],
"phi": vbfJetVars[key_map("VBFJetPhi")],
"eta": vbfJetVars[key_map("VBFJetEta")],
"M": vbfJetVars[key_map("VBFJetMass")],
}
)
vbf1 = vbfjets[:, 0]
vbf2 = vbfjets[:, 1]
jj = vbfjets[:, 0] + vbfjets[:, 1]
ak4away = vector.array(
{
"pt": ak4JetAwayVars["AK4JetAwayPt"],
"phi": ak4JetAwayVars["AK4JetAwayPhi"],
"eta": ak4JetAwayVars["AK4JetAwayEta"],
"M": ak4JetAwayVars["AK4JetAwayMass"],
"pt": ak4JetAwayVars[key_map("AK4JetAwayPt")],
"phi": ak4JetAwayVars[key_map("AK4JetAwayPhi")],
"eta": ak4JetAwayVars[key_map("AK4JetAwayEta")],
"M": ak4JetAwayVars[key_map("AK4JetAwayMass")],
}
)
ak4away1 = ak4away[:, 0]
Expand All @@ -1140,7 +1148,7 @@ def getBDT(
key_map("H1T32"): bbFatJetVars[key_map("bbFatJetTau3OverTau2")][:, 0],
key_map("H2T32"): bbFatJetVars[key_map("bbFatJetTau3OverTau2")][:, 1],
# fatjet mass
key_map("H1Mass"): bbFatJetVars[key_map("bbFatJetPNetMassLegacy")][:, 0],
key_map("H1Mass"): bbFatJetVars[key_map("bbFatJetParTmassVis")][:, 0],
# fatjet kinematics
key_map("H1Pt"): h1.pt,
key_map("H2Pt"): h2.pt,
Expand Down
2 changes: 1 addition & 1 deletion src/HH4b/processors/objects.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,7 @@ def get_ak8jets(fatjets: FatJetArray):
fatjets["ParTPXtauhtaum"] = fatjets.globalParT_Xtauhtaum
# T for discriminator
fatjets["ParTTXbb"] = fatjets.globalParT_XbbVsQCD
# Mass Regression
# Mass Regression (Raw)
fatjets["ParTmassRes"] = fatjets.globalParT_massRes * (1 - fatjets.rawFactor) * fatjets.mass
fatjets["ParTmassVis"] = fatjets.globalParT_massVis * (1 - fatjets.rawFactor) * fatjets.mass

Expand Down
1 change: 0 additions & 1 deletion src/HH4b/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,7 +349,6 @@ def load_samples(
try:
events = pd.read_parquet(parquet_path, filters=filters, columns=load_columns)
except Exception:
events = pd.read_parquet(parquet_path, filters=filters, columns=load_columns)
warnings.warn(
f"Can't read file with requested columns/filters for {sample}!", stacklevel=1
)
Expand Down
Loading