From 50a229e0f2ce8e38aed4bf6ed796d69037c748d1 Mon Sep 17 00:00:00 2001
From: Apple
Date: Sun, 22 Sep 2019 03:09:03 +0900
Subject: [PATCH 1/2] add multi-GPU torch-summary capability

---
 torchsummary/torchsummary.py | 138 ++++++++++++++++++-----------------
 1 file changed, 73 insertions(+), 65 deletions(-)

diff --git a/torchsummary/torchsummary.py b/torchsummary/torchsummary.py
index cbe18e3..f9e9a27 100644
--- a/torchsummary/torchsummary.py
+++ b/torchsummary/torchsummary.py
@@ -41,75 +41,83 @@ def hook(module, input, output):
         ):
             hooks.append(module.register_forward_hook(hook))
 
-    device = device.lower()
-    assert device in [
-        "cuda",
-        "cpu",
-    ], "Input device is not valid, please specify 'cuda' or 'cpu'"
-
-    if device == "cuda" and torch.cuda.is_available():
-        dtype = torch.cuda.FloatTensor
-    else:
-        dtype = torch.FloatTensor
+    if isinstance(device, str):
+        device = device.lower()
+    # parse helper returning a torch.device; the argument may be a torch.device
+    # object, a device string ('cuda:1'), or an integer device index (1)
+    device = torch._C._nn._parse_to(device)[0]
 
     # multiple inputs to the network
     if isinstance(input_size, tuple):
         input_size = [input_size]
 
     # batch_size of 2 for batchnorm
-    x = [torch.rand(2, *in_size).type(dtype) for in_size in input_size]
-    # print(type(x[0]))
-
-    # create properties
-    summary = OrderedDict()
-    hooks = []
-
-    # register hook
-    model.apply(register_hook)
-
-    # make a forward pass
-    # print(x.shape)
-    model(*x)
-
-    # remove these hooks
-    for h in hooks:
-        h.remove()
-
-    print("----------------------------------------------------------------")
-    line_new = "{:>20} {:>25} {:>15}".format("Layer (type)", "Output Shape", "Param #")
-    print(line_new)
-    print("================================================================")
-    total_params = 0
-    total_output = 0
-    trainable_params = 0
-    for layer in summary:
-        # input_shape, output_shape, trainable, nb_params
-        line_new = "{:>20} {:>25} {:>15}".format(
-            layer,
-            str(summary[layer]["output_shape"]),
-            "{0:,}".format(summary[layer]["nb_params"]),
-        )
-        total_params += summary[layer]["nb_params"]
-        total_output += np.prod(summary[layer]["output_shape"])
-        if "trainable" in summary[layer]:
-            if summary[layer]["trainable"] == True:
-                trainable_params += summary[layer]["nb_params"]
-        print(line_new)
+    try:
+        if device == torch.device('cuda'):
+            if torch.cuda.is_available():
+                x = [torch.rand(2, *in_size).to('cuda') for in_size in input_size]
+            else:
+                raise Exception("No CUDA-capable device detected.")
+        elif not (device == torch.device('cpu') or device == torch.device('cpu:0')):
+            with torch.cuda.device(device):
+                if torch.cuda.is_available():
+                    x = [torch.rand(2, *in_size).to(device) for in_size in input_size]
+    except RuntimeError:
+        raise Exception("Specified device either doesn't exist or is not CUDA-capable.") from None
+    else:
+        if device == torch.device('cpu') or device == torch.device('cpu:0'):
+            x = [torch.rand(2, *in_size).to('cpu') for in_size in input_size]
+    # print(type(x[0]))
+
+    # create properties
+    summary = OrderedDict()
+    hooks = []
 
-    # assume 4 bytes/number (float on cuda).
-    total_input_size = abs(np.prod(input_size) * batch_size * 4. / (1024 ** 2.))
-    total_output_size = abs(2. * total_output * 4. / (1024 ** 2.))  # x2 for gradients
-    total_params_size = abs(total_params.numpy() * 4. / (1024 ** 2.))
-    total_size = total_params_size + total_output_size + total_input_size
-
-    print("================================================================")
-    print("Total params: {0:,}".format(total_params))
-    print("Trainable params: {0:,}".format(trainable_params))
-    print("Non-trainable params: {0:,}".format(total_params - trainable_params))
-    print("----------------------------------------------------------------")
-    print("Input size (MB): %0.2f" % total_input_size)
-    print("Forward/backward pass size (MB): %0.2f" % total_output_size)
-    print("Params size (MB): %0.2f" % total_params_size)
-    print("Estimated Total Size (MB): %0.2f" % total_size)
-    print("----------------------------------------------------------------")
-    # return summary
+    # register hook
+    model.apply(register_hook)
+
+    # make a forward pass
+    # print(x.shape)
+    model(*x)
+
+    # remove these hooks
+    for h in hooks:
+        h.remove()
+
+    print("----------------------------------------------------------------")
+    line_new = "{:>20} {:>25} {:>15}".format("Layer (type)", "Output Shape", "Param #")
+    print(line_new)
+    print("================================================================")
+    total_params = 0
+    total_output = 0
+    trainable_params = 0
+    for layer in summary:
+        # input_shape, output_shape, trainable, nb_params
+        line_new = "{:>20} {:>25} {:>15}".format(
+            layer,
+            str(summary[layer]["output_shape"]),
+            "{0:,}".format(summary[layer]["nb_params"]),
+        )
+        total_params += summary[layer]["nb_params"]
+        total_output += np.prod(summary[layer]["output_shape"])
+        if "trainable" in summary[layer]:
+            if summary[layer]["trainable"] == True:
+                trainable_params += summary[layer]["nb_params"]
+        print(line_new)
+
+    # assume 4 bytes/number (float on cuda).
+    total_input_size = abs(np.prod(input_size) * batch_size * 4. / (1024 ** 2.))
+    total_output_size = abs(2. * total_output * 4. / (1024 ** 2.))  # x2 for gradients
+    total_params_size = abs(total_params.numpy() * 4. / (1024 ** 2.))
+    total_size = total_params_size + total_output_size + total_input_size
+
+    print("================================================================")
+    print("Total params: {0:,}".format(total_params))
+    print("Trainable params: {0:,}".format(trainable_params))
+    print("Non-trainable params: {0:,}".format(total_params - trainable_params))
+    print("----------------------------------------------------------------")
+    print("Input size (MB): %0.2f" % total_input_size)
+    print("Forward/backward pass size (MB): %0.2f" % total_output_size)
+    print("Params size (MB): %0.2f" % total_params_size)
+    print("Estimated Total Size (MB): %0.2f" % total_size)
+    print("----------------------------------------------------------------")
+    # return summary
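
The private torch._C._nn._parse_to helper backs Tensor.to's argument handling. The normalization the patch relies on can be sketched with public API only; normalize_device below is a hypothetical stand-in, assuming a bare int is meant as a CUDA index, as with Tensor.to(1):

    import torch

    def normalize_device(device):
        # A bare integer is interpreted as a CUDA device index,
        # as Tensor.to(1) would do; other forms pass through torch.device.
        if isinstance(device, int):
            return torch.device('cuda', device)
        return torch.device(device)

    print(normalize_device('cuda:1'))             # cuda:1
    print(normalize_device(torch.device('cpu')))  # cpu
    print(normalize_device(1))                    # cuda:1
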
From a52f625b1172b58e91585666c013e2baa61d2978 Mon Sep 17 00:00:00 2001
From: Apple
Date: Sun, 22 Sep 2019 03:16:03 +0900
Subject: [PATCH 2/2] multi-gpu compatible: torch-summary for cuda:1 and so on

---
 torchsummary/torchsummary.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchsummary/torchsummary.py b/torchsummary/torchsummary.py
index f9e9a27..62efe3f 100644
--- a/torchsummary/torchsummary.py
+++ b/torchsummary/torchsummary.py
@@ -104,7 +104,7 @@ def hook(module, input, output):
             trainable_params += summary[layer]["nb_params"]
         print(line_new)
 
-    # assume 4 bytes/number (float on cuda). 
+    # assume 4 bytes/number (float on cuda).
     total_input_size = abs(np.prod(input_size) * batch_size * 4. / (1024 ** 2.))
     total_output_size = abs(2. * total_output * 4. / (1024 ** 2.))  # x2 for gradients
     total_params_size = abs(total_params.numpy() * 4. / (1024 ** 2.))
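
With both patches applied, the summary can be generated on a specific GPU. A usage sketch, assuming torchsummary's existing summary(model, input_size, batch_size=-1, device=...) signature, torchvision installed, and a machine with at least two CUDA devices:

    import torch
    import torchvision.models as models
    from torchsummary import summary

    model = models.vgg16().to('cuda:1')

    # all three device forms are now accepted
    summary(model, (3, 224, 224), device='cuda:1')                # device string
    summary(model, (3, 224, 224), device=torch.device('cuda:1'))  # torch.device object
    summary(model, (3, 224, 224), device=1)                       # bare device index
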