optimized pytorch geometric pipeline using DDP (#118)

* update the pytorch pipeline

farakiko authored Jul 1, 2022
1 parent 2b21fa2 commit 950ce04
Showing 33 changed files with 3,526 additions and 2,232 deletions.
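The commit title refers to DistributedDataParallel (DDP) training; the training scripts themselves are not shown in the hunks below, so the following is a minimal, assumed sketch of the kind of DDP-wrapped training setup the title points at (names and the spawn-style entry point are illustrative, not this repository's exact code):

import os
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

def run(rank, world_size, model):
    # each process binds to one GPU rank and joins the process group
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "12355")
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    ddp_model = DDP(model.to(rank), device_ids=[rank])  # gradients are all-reduced across ranks
    # ... training loop over a rank-local shard of the dataset goes here ...
    dist.destroy_process_group()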
15 changes: 6 additions & 9 deletions .gitignore
@@ -4,21 +4,18 @@
*.pt
*.pdf
data/*
experiments/*
prp/*
*.pth
test/__pycache__/

mlpf/pytorch/__pycache__/*
mlpf/plotting/__pycache__/*
mlpf/pytorch/data
test_tmp/
test_tmp_delphes/
*/__pycache__/*
.DS_Store

prp
*.pyc
*.pyo

mlpf/updated/LRP/pid*
mlpf/updated/LRP/class*

*.ipynb_checkpoints

*playground.py
nohup.out
17 changes: 9 additions & 8 deletions mlpf/lrp/lrp_mlpf.py
@@ -14,14 +14,15 @@
class LRP_MLPF():

"""
A class that acts on graph datasets and GNNs based on the GravNet layer (e.g. the MLPF model)
The main trick is to realize that the ".lin_s" layers in Gravnet are irrelevant for explanations so shall be skipped
The hack, however, is to substitute them precisely with the message_passing step
A class that introduces useful functionality to perform layer-wise relevance propagation (LRP) on MLPF.
This class is meant to work for any model with any number of GravNetConv layers.
The main trick is to realize that the ".lin_s" layers in GravNetConv are irrelevant for explanations and so can be skipped.
The hack, however, is to substitute them precisely with the message_passing step.
Differences from standard LRP
- Rscores become tensors/graphs of input features per output neuron instead of vectors
- accommodates message passing steps by using the adjacency matrix as the weight matrix in standard LRP,
and redistributing Rscores over the other dimension (over nodes instead of features)
a. Rscores become tensors/graphs of input features per output neuron instead of vectors
b. accommodates message passing steps by using the adjacency matrix as the weight matrix in standard LRP,
and redistributing Rscores over the other dimension (over nodes instead of features)
"""

def __init__(self, device, model, epsilon):
@@ -153,8 +154,8 @@ def eps_rule(self, layer, layer_name, x, R_tensor_old, neuron_to_explain, msg_pa
Can accommodate message_passing layers if the adjacency matrix and the activations before the message_passing are provided.
The trick (or as we like to call it, the message_passing hack) is in
(1) using the adjacency matrix as the weight matrix in the standard lrp rule
(2) transposing the activations to distribute the Rscores over the other dimension (over nodes instead of features)
a. using the adjacency matrix as the weight matrix in the standard lrp rule
b. transposing the activations to distribute the Rscores over the other dimension (over nodes instead of features)
Args:
layer: a torch.nn module with a corresponding weight matrix W
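As a minimal sketch of the eps_rule logic described above (assuming weights shaped like torch.nn.Linear and a dense adjacency matrix A for the message-passing hack; the names here are illustrative, not the class's exact internals):

import torch

def eps_rule_sketch(W, x, R_old, eps=1e-9, msg_passing=False, A=None):
    # message_passing hack: use the adjacency matrix as the weight matrix and
    # transpose activations/Rscores so relevance flows over nodes, not features
    if msg_passing:
        W, x, R_old = A, x.t(), R_old.t()
    z = x @ W.t() + eps   # forward pre-activations, stabilized by eps
    s = R_old / z         # relevance per unit of pre-activation
    c = s @ W             # redistribute backwards through the weights
    R_new = x * c         # approximately conserves the total relevance
    return R_new.t() if msg_passing else R_new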
50 changes: 5 additions & 45 deletions mlpf/lrp/model.py
@@ -103,51 +103,11 @@ def forward(self, batch):

class GravNetConv_LRP(MessagePassing):
"""
Copied from pytorch_geometric source code
Edits:
- retrieve adjacency matrix (we call A), and the activations before the message passing step (we call msg_activations)
- switched the execution of self.lin_s & self.lin_p so that the message passing step can substitute out of the box self.lin_s for lrp purposes
- used reduce='sum' instead of reduce='mean' in the message passing
- removed skip connection
The GravNet operator from the `"Learning Representations of Irregular
Particle-detector Geometry with Distance-weighted Graph
Networks" <https://arxiv.org/abs/1902.07987>`_ paper, where the graph is
dynamically constructed using nearest neighbors.
The neighbors are constructed in a learnable low-dimensional projection of
the feature space.
A second projection of the input feature space is then propagated from the
neighbors to each vertex using distance weights that are derived by
applying a Gaussian function to the distances.
Args:
in_channels (int): Size of each input sample, or :obj:`-1` to derive
the size from the first input(s) to the forward method.
out_channels (int): The number of output channels.
space_dimensions (int): The dimensionality of the space used to
construct the neighbors; referred to as :math:`S` in the paper.
propagate_dimensions (int): The number of features to be propagated
between the vertices; referred to as :math:`F_{\textrm{LR}}` in the
paper.
k (int): The number of nearest neighbors.
num_workers (int): Number of workers to use for k-NN computation.
Has no effect in case :obj:`batch` is not :obj:`None`, or the input
lies on the GPU. (default: :obj:`1`)
**kwargs (optional): Additional arguments of
:class:`torch_geometric.nn.conv.MessagePassing`.
Shapes:
- **input:**
node features :math:`(|\mathcal{V}|, F_{in})` or
:math:`((|\mathcal{V_s}|, F_{in}), (|\mathcal{V_t}|, F_{in}))`
if bipartite,
batch vector :math:`(|\mathcal{V}|)` or
:math:`((|\mathcal{V}_s|), (|\mathcal{V}_t|))` if bipartite
*(optional)*
- **output:** node features :math:`(|\mathcal{V}|, F_{out})` or
:math:`(|\mathcal{V}_t|, F_{out})` if bipartite
Copied from pytorch_geometric source code, with the following edits
a. used reduce='sum' instead of reduce='mean' in the message passing
b. removed the skip connection
c. retrieved the adjacency matrix and the activations before the message passing, both of which are useful only for LRP purposes
d. switched the execution order of self.lin_s & self.lin_p so that the message passing step can substitute self.lin_s out of the box for LRP purposes
"""

def __init__(self, in_channels: int, out_channels: int,
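To make the operator description above concrete, here is a rough functional sketch of one GravNet aggregation step under the listed edits (reduce='sum', no skip connection); lin_s/lin_p stand in for the projections named in the docstring, and the exact signatures are assumptions rather than the repository's implementation:

import torch
from torch_geometric.nn import knn_graph

def gravnet_step_sketch(lin_s, lin_p, x, k=16):
    s = lin_s(x)                                  # learned low-dimensional space S
    src, dst = knn_graph(s, k=k)                  # kNN graph built in S-space
    d2 = (s[src] - s[dst]).pow(2).sum(dim=-1)     # squared distances in S
    w = torch.exp(-10.0 * d2)                     # Gaussian distance weights
    h = lin_p(x)                                  # features to propagate
    out = torch.zeros_like(h)
    out.index_add_(0, dst, h[src] * w.unsqueeze(-1))  # reduce='sum' over neighbors
    return out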
46 changes: 17 additions & 29 deletions mlpf/lrp_pipeline.py → mlpf/lrp_mlpf_pipeline.py
@@ -1,25 +1,18 @@
from pytorch_delphes import PFGraphDataset, dataloader_qcd, load_model
from pyg import PFGraphDataset, dataloader_qcd, load_model
from lrp import MLPF, LRP_MLPF, make_Rmaps

import argparse
import pickle as pkl
import os.path as osp
import os
import sys
from glob import glob

import numpy as np
import mplhep as hep
import pandas as pd

import torch
import torch_geometric
from torch_geometric.nn import GravNetConv

import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Sequential as Seq, Linear as Lin, ReLU
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from torch_geometric.data import Data, DataLoader, DataListLoader, Batch


@@ -28,27 +21,20 @@
parser = argparse.ArgumentParser()

# for saving the model
parser.add_argument("--dataset_qcd", type=str, default='../data/test_tmp_delphes/data/pythia8_qcd', help="testing dataset path")
parser.add_argument("--outpath", type=str, default='../data/test_tmp_delphes/experiments/', help="path to the trained model directory")
parser.add_argument("--load_model", type=str, default="", help="Which model to load")
parser.add_argument("--load_epoch", type=int, default=0, help="Which epoch of the model to load")
parser.add_argument("--out_neuron", type=int, default=0, help="the output neuron you wish to explain")
parser.add_argument("--pid", type=str, default="chhadron", help="Which model to load")
parser.add_argument("--n_test", type=int, default=50, help="number of data files to use for testing.. each file contains 100 events")
parser.add_argument("--dataset_qcd", type=str, default='../data/delphes/pythia8_qcd', help="testing dataset path")
parser.add_argument("--outpath", type=str, default='../experiments/', help="path to the trained model directory")
parser.add_argument("--load_model", type=str, default="", help="Which model to load")
parser.add_argument("--load_epoch", type=int, default=0, help="Which epoch of the model to load")
parser.add_argument("--out_neuron", type=int, default=0, help="the output neuron you wish to explain")
parser.add_argument("--pid", type=str, default="chhadron", help="Which model to load")
parser.add_argument("--n_test", type=int, default=50, help="number of data files to use for testing.. each file contains 100 events")
parser.add_argument("--run_lrp", dest='run_lrp', action='store_true', help="runs lrp")
parser.add_argument("--make_rmaps", dest='make_rmaps', action='store_true', help="makes rmaps")

args = parser.parse_args()


if __name__ == "__main__":
"""
e.g. to run lrp and make Rmaps
python -u lrp_pipeline.py --run_lrp --make_rmaps --load_model='MLPF_gen_ntrain_1_nepochs_1_clf_reg' --load_epoch=0 --n_test=1 --pid='chhadron'
e.g. to only make Rmaps
python -u lrp_pipeline.py --make_rmaps --load_model='MLPF_gen_ntrain_1_nepochs_1_clf_reg' --load_epoch=0 --n_test=1 --out_neuron=0 --pid='chhadron'
"""

if args.run_lrp:
# check the GPU configuration and define the global base device
@@ -71,24 +57,26 @@
model = MLPF(**model_kwargs)
model.load_state_dict(state_dict)
model.to(device)
model.eval()

# run lrp
# initialize placeholders for Rscores, the event inputs, and the event predictions
Rtensors_list, preds_list, inputs_list = [], [], []

# define the lrp instance
lrp_instance = LRP_MLPF(device, model, epsilon=1e-9)

# loop over events to explain them
for i, event in enumerate(loader):
print(f'Explaining event # {i}')

# run lrp on sample model
model.eval()
lrp_instance = LRP_MLPF(device, model, epsilon=1e-9)
# run lrp on the event
Rtensor, pred, input = lrp_instance.explain(event, neuron_to_explain=args.out_neuron)

# store the Rscores, the event inputs, and the event predictions
Rtensors_list.append(Rtensor.detach().to('cpu'))
preds_list.append(pred.detach().to('cpu'))
inputs_list.append(input.detach().to('cpu').to_dict())

break

with open(f'{outpath}/Rtensors_list.pkl', 'wb') as f:
pkl.dump(Rtensors_list, f)
with open(f'{outpath}/inputs_list.pkl', 'wb') as f:
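For reference, a small sketch of reading these pickles back before making Rmaps (outpath as defined in the script above; make_Rmaps' exact signature is not shown in this excerpt, so only the loading step is illustrated):

import pickle as pkl

with open(f'{outpath}/Rtensors_list.pkl', 'rb') as f:
    Rtensors_list = pkl.load(f)
with open(f'{outpath}/inputs_list.pkl', 'rb') as f:
    inputs_list = pkl.load(f)
print(len(Rtensors_list), Rtensors_list[0].shape)  # one Rscore tensor per explained event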
mlpf/pyg/PFGraphDataset.py
@@ -1,3 +1,13 @@
try:
from pyg.cms_utils import prepare_data_cms
except ImportError:
from cms_utils import prepare_data_cms

from numpy.lib.recfunctions import append_fields
import bz2
import h5py
import pandas
import pandas as pd
import numpy as np
import os
import os.path as osp
@@ -10,23 +20,6 @@
import pickle
import multiprocessing

# assumes pkl files exist in /test_tmp_delphes/data/pythia8_ttbar/raw
# they are processed and saved as pt files in /test_tmp_delphes/data/pythia8_ttbar/processed
# PFGraphDataset -> returns for 1 event: Data(x=[5139, 12], ycand=[5139, 6], ycand_id=[5139, 6], ygen=[5139, 6], ygen_id=[5139, 6])


def one_hot_embedding(labels, num_classes):
"""
Embedding labels to one-hot form.
Args:
labels: (LongTensor) class labels, sized [N,].
num_classes: (int) number of classes.
Returns:
(tensor) encoded labels, sized [N, #classes].
"""
y = torch.eye(num_classes)
return y[labels]


def process_func(args):
self, fns, idx_file = args
@@ -46,9 +39,10 @@ class PFGraphDataset(Dataset):
root (str): path
"""

def __init__(self, root, transform=None, pre_transform=None):
def __init__(self, root, data, transform=None, pre_transform=None):
super(PFGraphDataset, self).__init__(root, transform, pre_transform)
self._processed_dir = Dataset.processed_dir.fget(self)
self.data = data

@property
def raw_file_names(self):
@@ -79,41 +73,40 @@ def download(self):
pass

def process_single_file(self, raw_file_name):
with open(osp.join(self.raw_dir, raw_file_name), "rb") as fi:
data = pickle.load(fi, encoding='iso-8859-1')

x = []
ygen = []
ycand = []
d = []
batch_data = []
ygen_id = []
ycand_id = []

for i in range(len(data['X'])):
x.append(torch.tensor(data['X'][i], dtype=torch.float))
ygen.append(torch.tensor(data['ygen'][i], dtype=torch.float))
ycand.append(torch.tensor(data['ycand'][i], dtype=torch.float))

# one-hot encoding the first element in ygen & ycand (which is the PID) and store it in ygen_id & ycand_id
ygen_id.append(ygen[i][:, 0])
ycand_id.append(ycand[i][:, 0])

ygen_id[i] = ygen_id[i].long()
ycand_id[i] = ycand_id[i].long()

ygen_id[i] = one_hot_embedding(ygen_id[i], 6)
ycand_id[i] = one_hot_embedding(ycand_id[i], 6)

# remove from ygen & ycand the first element (PID) so that they only contain the regression variables
d = Data(
x=x[i],
ygen=ygen[i][:, 1:], ygen_id=ygen_id[i],
ycand=ycand[i][:, 1:], ycand_id=ycand_id[i]
)

batch_data.append(d)
return batch_data
"""
Loads a list of 100 events from a pkl file and generates pytorch geometric Data() objects and stores them in .pt format.
For cms data, each element is assumed to be a dict('Xelem', 'ygen', ycand') of numpy rec_arrays with the first element in ygen/ycand is the pid
For delphes data, each element is assumed to be a dict('X', 'ygen', ycand') of numpy standard arrays with the first element in ygen/ycand is the pid
Args
raw_file_name: a pkl file
Returns
batched_data: a list of Data() objects of the form
cms ~ Data(x=[#elem, 41], ygen=[#elem, 6], ygen_id=[#elem, 9], ycand=[#elem, 6], ycand_id=[#elem, 9])
delphes ~ Data(x=[#elem, 12], ygen=[#elem, 6], ygen_id=[#elem, 6], ycand=[#elem, 6], ycand_id=[#elem, 6])
"""

if self.data == 'cms':
return prepare_data_cms(osp.join(self.raw_dir, raw_file_name))

elif self.data == 'delphes':
# load the data pkl file
with open(osp.join(self.raw_dir, raw_file_name), "rb") as fi:
data = pickle.load(fi, encoding='iso-8859-1')

batched_data = []  # initialize before the loop so the append below is defined
for i in range(len(data['X'])):
# remove from ygen & ycand the first element (PID) so that they only contain the regression variables
d = Data(
x=torch.tensor(data['X'][i], dtype=torch.float),
ygen=torch.tensor(data['ygen'][i], dtype=torch.float)[:, 1:],
ygen_id=torch.tensor(data['ygen'][i], dtype=torch.float)[:, 0].long(),
ycand=torch.tensor(data['ycand'][i], dtype=torch.float)[:, 1:],
ycand_id=torch.tensor(data['ycand'][i], dtype=torch.float)[:, 0].long(),
)

batched_data.append(d)

return batched_data

def process_multiple_files(self, filenames, idx_file):
datas = [self.process_single_file(fn) for fn in filenames]
@@ -139,7 +132,7 @@ def process_parallel(self, num_files_to_batch, num_proc):

def get(self, idx):
p = osp.join(self.processed_dir, 'data_{}.pt'.format(idx))
data = torch.load(p)
data = torch.load(p, map_location='cpu')
return data

def __getitem__(self, idx):
@@ -149,6 +142,7 @@ def __getitem__(self, idx):
def parse_args():
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--data", type=str, required=True, help="'cms' or 'delphes'?")
parser.add_argument("--dataset", type=str, required=True, help="Input data path")
parser.add_argument("--processed_dir", type=str, help="processed", required=False, default=None)
parser.add_argument("--num-files-merge", type=int, default=10, help="number of files to merge")
@@ -159,12 +153,20 @@ def parse_args():

if __name__ == "__main__":

"""
e.g. to run for cms
python PFGraphDataset.py --data cms --dataset ../../data/cms/TTbar_14TeV_TuneCUETP8M1_cfi --processed_dir ../../data/cms/TTbar_14TeV_TuneCUETP8M1_cfi/processed --num-files-merge 1 --num-proc 1
e.g. to run for delphes
python3 PFGraphDataset.py --data delphes --dataset $sample --processed_dir $sample/processed --num-files-merge 1 --num-proc 1
"""

args = parse_args()

pfgraphdataset = PFGraphDataset(root=args.dataset)
pfgraphdataset = PFGraphDataset(root=args.dataset, data=args.data)

if args.processed_dir:
pfgraphdataset._processed_dir = args.processed_dir

pfgraphdataset.process_parallel(args.num_files_merge, args.num_proc)
# pfgraphdataset.process(args.num_files_merge)
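Once processed, the saved .pt files can be read back through the dataset's get()/__getitem__ shown above. A small assumed usage sketch (the qcd path matches the lrp pipeline defaults earlier in this commit):

from pyg import PFGraphDataset  # assumes mlpf/pyg is on the import path

dataset = PFGraphDataset(root='../data/delphes/pythia8_qcd', data='delphes')
events = dataset.get(0)   # one processed file -> a list of Data() objects
print(events[0])          # e.g. Data(x=[#elem, 12], ygen=[#elem, 6], ygen_id=[#elem, 6], ...)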