Rewrite and notice copyright for CTGAN #50

Merged · 4 commits · Dec 5, 2023
1 change: 1 addition & 0 deletions NOTICE
@@ -0,0 +1 @@
+Reference is made to the implementation of CTGAN (https://github.com/sdv-dev/CTGAN) version 0.6.0, which is licensed under the MIT License. Our modifications are under the Apache 2.0 License.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -14,7 +14,7 @@ dependencies = [
"pandas",
"numpy",
"scikit-learn",
"torch",
"torch>=2",
"torchvision",
"rdt",
"joblib",
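Note: the `torch>=2` floor is what allows the ctgan.py change below to drop its NaN retry loop, since `gumbel_softmax` is only reliably stable on fixed torch builds (see pytorch/pytorch#41663, linked in the diff). A defensive runtime check along these lines would be possible, though it is not part of this PR:

```python
# Illustrative guard only, not part of this PR: fail fast when the installed
# torch predates the floor pinned in pyproject.toml.
import torch
from packaging.version import Version

if Version(torch.__version__) < Version("2.0.0"):
    raise RuntimeError(f"sdgx requires torch>=2, found {torch.__version__}")
```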
15 changes: 6 additions & 9 deletions sdgx/data_process/transform/transformer_opt.py
@@ -181,15 +181,12 @@ def _transform_continuous(self, column_transform_info, data):
column_name = data.columns[0]
flattened_column = data[column_name].to_numpy().flatten()
data = data.assign(**{column_name: flattened_column})
-gm = column_transform_info.transform
-transformed = gm.transform(data)
-
-# Converts the transformed data to the appropriate output format.
-# The first column (ending in '.normalized') stays the same,
-# but the label-encoded column (ending in '.component') is one-hot encoded.
-output = np.zeros((len(transformed), column_transform_info.output_dimensions))
-output[:, 0] = transformed[f"{column_name}.normalized"].to_numpy()
-index = transformed[f"{column_name}.component"].to_numpy().astype(int)
+transformed_data = column_transform_info.transform.transform(data)
+
+output_dimensions = column_transform_info.output_dimensions
+output = np.zeros((len(transformed_data), output_dimensions))
+output[:, 0] = transformed_data[f"{column_name}.normalized"].to_numpy()
+index = transformed_data[f"{column_name}.component"].to_numpy().astype(int)
output[np.arange(index.size), index + 1] = 1.0

return output
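For context, the refactor keeps upstream CTGAN's output layout for a continuous column: column 0 holds the `.normalized` value and columns 1..k one-hot encode the `.component` index. A minimal numpy sketch with made-up values (the 3-component mixture, `normalized`, and `component` are illustrative only):

```python
import numpy as np

# Pretend a ClusterBasedNormalizer-style transform produced these for 3 rows:
normalized = np.array([0.12, -0.55, 0.30])  # the '<col>.normalized' column
component = np.array([2, 0, 1])             # label-encoded '<col>.component'
output_dimensions = 1 + 3                   # 1 normalized column + 3 components

output = np.zeros((len(normalized), output_dimensions))
output[:, 0] = normalized
output[np.arange(component.size), component + 1] = 1.0  # scatter the one-hot

print(output)
# [[ 0.12  0.    0.    1.  ]
#  [-0.55  1.    0.    0.  ]
#  [ 0.3   0.    1.    0.  ]]
```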
34 changes: 23 additions & 11 deletions sdgx/models/base.py
@@ -1,3 +1,9 @@
+# Refer to CTGAN version 0.6.0: https://github.com/sdv-dev/CTGAN@a40570e321cb46d798a823f350e1010a0270d804
+# which is licensed under the MIT License
+
+import os
from copy import deepcopy
from typing import List, Optional

import numpy as np
@@ -29,28 +35,34 @@ def __getstate__(self):
self.set_device(torch.device("cpu"))
state = self.__dict__.copy()
self.set_device(device_backup)

+random_states = self.random_states
if (
-    isinstance(self.random_states, tuple)
-    and isinstance(self.random_states[0], np.random.RandomState)
-    and isinstance(self.random_states[1], torch.Generator)
+    isinstance(random_states, tuple)
+    and isinstance(random_states[0], np.random.RandomState)
+    and isinstance(random_states[1], torch.Generator)
):
-    state["_numpy_random_state"] = self.random_states[0].get_state()
-    state["_torch_random_state"] = self.random_states[1].get_state()
-    state.pop("random_states")
+    state["_numpy_random_state"] = random_states[0].get_state()
+    state["_torch_random_state"] = random_states[1].get_state()
+    del state["random_states"]

return state

def __setstate__(self, state):
if "_numpy_random_state" in state and "_torch_random_state" in state:
np_state = state.pop("_numpy_random_state")
torch_state = state.pop("_torch_random_state")
np_state = state.pop("_numpy_random_state", None)
torch_state = state.pop("_torch_random_state", None)
if np_state is not None and torch_state is not None:
current_torch_state = torch.Generator()
current_torch_state.set_state(torch_state)
current_numpy_state = np.random.RandomState()
current_numpy_state.set_state(np_state)
state["random_states"] = (current_numpy_state, current_torch_state)
self.__dict__ = state
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-self.set_device(device)
+# FIXME: Extract this into a config file
+if not os.getenv("SDG_FORCE_LOAD_CPU"):
+    # Prefer cuda if not specified
+    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+    self.set_device(device)

def save(self, path):
device_backup = self._device
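Net effect of the base.py changes: pickling moves the model to CPU and stores the RNG states in plain, picklable form; unpickling restores them and only re-selects a device when `SDG_FORCE_LOAD_CPU` is unset. A standalone sketch of the same round trip (names here are illustrative, not the sdgx API):

```python
import os
import pickle

import numpy as np
import torch

# Capture RNG states in a pickle-friendly form, as __getstate__ does.
numpy_rs = np.random.RandomState(0)
torch_gen = torch.Generator().manual_seed(0)
state = {
    "_numpy_random_state": numpy_rs.get_state(),
    "_torch_random_state": torch_gen.get_state(),  # CPU ByteTensor, picklable
}
blob = pickle.dumps(state)

# Restore them on load, as __setstate__ does.
loaded = pickle.loads(blob)
restored_numpy = np.random.RandomState()
restored_numpy.set_state(loaded["_numpy_random_state"])
restored_torch = torch.Generator()
restored_torch.set_state(loaded["_torch_random_state"])

# Device policy mirrors the patched __setstate__: the env override wins.
if not os.getenv("SDG_FORCE_LOAD_CPU"):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
else:
    device = torch.device("cpu")
print(device)
```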
30 changes: 6 additions & 24 deletions sdgx/models/single_table/ctgan.py
@@ -1,3 +1,6 @@
+# Refer to CTGAN version 0.6.0: https://github.com/sdv-dev/CTGAN@a40570e321cb46d798a823f350e1010a0270d804
+# which is licensed under the MIT License
+
import warnings
from typing import Any, List, Optional

@@ -219,31 +222,10 @@ def __init__(

@staticmethod
def _gumbel_softmax(logits, tau=1, hard=False, eps=1e-10, dim=-1):
"""Deals with the instability of the gumbel_softmax for older versions of torch.

For more details about the issue:
https://drive.google.com/file/d/1AA5wPfZ1kquaRtVruCd6BiYZGcDeNxyP/view?usp=sharing

Args:
logits […, num_features]:
Unnormalized log probabilities
tau:
Non-negative scalar temperature
hard (bool):
If True, the returned samples will be discretized as one-hot vectors,
but will be differentiated as if it is the soft sample in autograd
dim (int):
A dimension along which softmax will be computed. Default: -1.

Returns:
Sampled tensor of same shape as logits from the Gumbel-Softmax distribution.
"""
for _ in range(10):
transformed = functional.gumbel_softmax(logits, tau=tau, hard=hard, eps=eps, dim=dim)
if not torch.isnan(transformed).any():
return transformed

raise ValueError("gumbel_softmax returning NaN.")
For compatibility, gumbelsoftmax is stable now: https://github.com/pytorch/pytorch/issues/41663
"""
return functional.gumbel_softmax(logits, tau=tau, hard=hard, eps=eps, dim=dim)

def _apply_activate(self, data):
"""Apply proper activation function to the output of the generator."""
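The deleted retry loop worked around occasional NaNs from `torch.nn.functional.gumbel_softmax` on older torch (the linked pytorch/pytorch#41663); with the `torch>=2` pin a direct call suffices. A quick sanity check of the direct call (shapes and values are arbitrary):

```python
import torch
from torch.nn import functional

# Arbitrary unnormalized log-probabilities: batch of 4 rows, 3 categories.
logits = torch.randn(4, 3)

# hard=True returns one-hot samples in the forward pass while gradients
# flow through the soft sample (straight-through estimator).
sample = functional.gumbel_softmax(logits, tau=1.0, hard=True, dim=-1)

assert sample.shape == logits.shape
assert not torch.isnan(sample).any()          # stable on torch >= 2
assert torch.all(sample.sum(dim=-1) == 1.0)   # each row is one-hot
```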
36 changes: 31 additions & 5 deletions sdgx/statistics/base.py
@@ -1,3 +1,8 @@
+# Refer to CTGAN version 0.6.0: https://github.com/sdv-dev/CTGAN@a40570e321cb46d798a823f350e1010a0270d804
+# which is licensed under the MIT License
+
+import os
from copy import deepcopy
from typing import List, Optional

import numpy as np
@@ -39,18 +44,39 @@ def __getstate__(self):
state.pop("random_states")
return state

+def __getstate__(self):
+    device_backup = self._device
+    self.set_device(torch.device("cpu"))
+    state = self.__dict__.copy()
+    self.set_device(device_backup)
+
+    random_states = self.random_states
+    if (
+        isinstance(random_states, tuple)
+        and isinstance(random_states[0], np.random.RandomState)
+        and isinstance(random_states[1], torch.Generator)
+    ):
+        state["_numpy_random_state"] = random_states[0].get_state()
+        state["_torch_random_state"] = random_states[1].get_state()
+        del state["random_states"]
+
+    return state

def __setstate__(self, state):
if "_numpy_random_state" in state and "_torch_random_state" in state:
np_state = state.pop("_numpy_random_state")
torch_state = state.pop("_torch_random_state")
np_state = state.pop("_numpy_random_state", None)
torch_state = state.pop("_torch_random_state", None)
if np_state is not None and torch_state is not None:
current_torch_state = torch.Generator()
current_torch_state.set_state(torch_state)
current_numpy_state = np.random.RandomState()
current_numpy_state.set_state(np_state)
state["random_states"] = (current_numpy_state, current_torch_state)
self.__dict__ = state
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-self.set_device(device)
+# FIXME: Extract this into a config file
+if not os.getenv("SDG_FORCE_LOAD_CPU"):
+    # Prefer cuda if not specified
+    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+    self.set_device(device)

def save(self, path):
device_backup = self._device
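Usage note: `SDG_FORCE_LOAD_CPU` lets a model pickled on a GPU machine be loaded on a CPU-only box without touching CUDA. A small illustrative helper (not part of sdgx) showing the resulting policy:

```python
import os

import torch

def pick_load_device() -> torch.device:
    """Mirror the patched __setstate__: the env override beats CUDA."""
    if os.getenv("SDG_FORCE_LOAD_CPU"):
        return torch.device("cpu")  # keep the CPU device __getstate__ pickled with
    return torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

os.environ["SDG_FORCE_LOAD_CPU"] = "1"  # e.g. loading a GPU-trained model on a laptop
assert pick_load_device() == torch.device("cpu")

del os.environ["SDG_FORCE_LOAD_CPU"]
print(pick_load_device())  # cuda:0 if available, else cpu
```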