Skip to content

Commit d5c521b

Browse files
SunHaozhe and xuyxu
authored
feat: use new implementation of Bagging (#120)
* new implementation of Bagging
* pep8 style fix
* code format
* break too long comments into lines
* try to fix CI issue
* update CI python ver
* Update test_all_models.py

Co-authored-by: Yi-Xuan Xu <xuyx@lamda.nju.edu.cn>
1 parent 908761d commit d5c521b

6 files changed

Lines changed: 48 additions & 24 deletions

File tree

.github/workflows/build-and-test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ jobs:
1212
strategy:
1313
matrix:
1414
os: [ubuntu-latest, windows-latest]
15-
python-version: [3.6, 3.7, 3.8, 3.9]
15+
python-version: [3.8, 3.9]
1616
steps:
1717
- uses: actions/checkout@v2
1818
- name: Set up Python

.github/workflows/code-quality.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ jobs:
1212
strategy:
1313
matrix:
1414
os: [ubuntu-latest]
15-
python-version: [3.7]
15+
python-version: [3.9]
1616
steps:
1717
- uses: actions/checkout@v2
1818
- name: Set up python

CHANGELOG.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ Changelog
1818
Ver 0.1.*
1919
---------
2020

21+
* |Fix| Fix the sampling issue in :class:`BaggingClassifier` and :class:`BaggingRegressor` | `@SunHaozhe <https://github.com/SunHaozhe>`__
2122
* |Feature| |API| Add :class:`NeuralForestClassifier` and :class:`NeuralForestRegressor` | `@xuyxu <https://github.com/xuyxu>`__
2223
* |Fix| Relax check on input dataloader | `@xuyxu <https://github.com/xuyxu>`__
2324
* |Feature| |API| Support arbitrary training criteria for all ensembles except Gradient Boosting | `@by256 <https://github.com/by256>`__ and `@xuyxu <https://github.com/xuyxu>`__

build_tools/requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1+
pytest==7.1.1
12
flake8
23
pytest-cov
34
click==8.0.3
45
black==20.8b1
5-
tensorboard==2.*
6+
tensorboard==2.*

torchensemble/bagging.py

Lines changed: 42 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -50,18 +50,9 @@ def _parallel_fit_per_epoch(
5050
data, target = io.split_data_target(elem, device)
5151
batch_size = data[0].size(0)
5252

53-
# Sampling with replacement
54-
sampling_mask = torch.randint(
55-
high=batch_size, size=(int(batch_size),), dtype=torch.int64
56-
)
57-
sampling_mask = torch.unique(sampling_mask) # remove duplicates
58-
subsample_size = sampling_mask.size(0)
59-
sampling_data = [tensor[sampling_mask] for tensor in data]
60-
sampling_target = target[sampling_mask]
61-
6253
optimizer.zero_grad()
63-
sampling_output = estimator(*sampling_data)
64-
loss = criterion(sampling_output, sampling_target)
54+
output = estimator(*data)
55+
loss = criterion(output, target)
6556
loss.backward()
6657
optimizer.step()
6758

@@ -70,16 +61,16 @@ def _parallel_fit_per_epoch(
7061

7162
# Classification
7263
if is_classification:
73-
_, predicted = torch.max(sampling_output.data, 1)
74-
correct = (predicted == sampling_target).sum().item()
64+
_, predicted = torch.max(output.data, 1)
65+
correct = (predicted == target).sum().item()
7566

7667
msg = (
7768
"Estimator: {:03d} | Epoch: {:03d} | Batch: {:03d}"
7869
" | Loss: {:.5f} | Correct: {:d}/{:d}"
7970
)
8071
print(
8172
msg.format(
82-
idx, epoch, batch_idx, loss, correct, subsample_size
73+
idx, epoch, batch_idx, loss, correct, batch_size
8374
)
8475
)
8576
else:
@@ -180,6 +171,12 @@ def _forward(estimators, *x):
180171

181172
return proba
182173

174+
# Turn train_loader into a list of train_loaders,
175+
# sampling with replacement
176+
train_loader = _get_bagging_dataloaders(
177+
train_loader, self.n_estimators
178+
)
179+
183180
# Maintain a pool of workers
184181
with Parallel(n_jobs=self.n_jobs) as parallel:
185182

@@ -198,7 +195,7 @@ def _forward(estimators, *x):
198195

199196
rets = parallel(
200197
delayed(_parallel_fit_per_epoch)(
201-
train_loader,
198+
dataloader,
202199
estimator,
203200
cur_lr,
204201
optimizer,
@@ -209,8 +206,8 @@ def _forward(estimators, *x):
209206
self.device,
210207
True,
211208
)
212-
for idx, (estimator, optimizer) in enumerate(
213-
zip(estimators, optimizers)
209+
for idx, (estimator, optimizer, dataloader) in enumerate(
210+
zip(estimators, optimizers, train_loader)
214211
)
215212
)
216213

@@ -360,6 +357,12 @@ def _forward(estimators, *x):
360357

361358
return pred
362359

360+
# Turn train_loader into a list of train_loaders,
361+
# sampling with replacement
362+
train_loader = _get_bagging_dataloaders(
363+
train_loader, self.n_estimators
364+
)
365+
363366
# Maintain a pool of workers
364367
with Parallel(n_jobs=self.n_jobs) as parallel:
365368

@@ -378,7 +381,7 @@ def _forward(estimators, *x):
378381

379382
rets = parallel(
380383
delayed(_parallel_fit_per_epoch)(
381-
train_loader,
384+
dataloader,
382385
estimator,
383386
cur_lr,
384387
optimizer,
@@ -389,8 +392,8 @@ def _forward(estimators, *x):
389392
self.device,
390393
False,
391394
)
392-
for idx, (estimator, optimizer) in enumerate(
393-
zip(estimators, optimizers)
395+
for idx, (estimator, optimizer, dataloader) in enumerate(
396+
zip(estimators, optimizers, train_loader)
394397
)
395398
)
396399

@@ -450,3 +453,22 @@ def evaluate(self, test_loader):
450453
@torchensemble_model_doc(item="predict")
451454
def predict(self, *x):
452455
return super().predict(*x)
456+
457+
458+
def _get_bagging_dataloaders(original_dataloader, n_estimators):
459+
dataset = original_dataloader.dataset
460+
dataloaders = []
461+
for i in range(n_estimators):
462+
# sampling with replacement
463+
indices = torch.randint(
464+
high=len(dataset), size=(len(dataset),), dtype=torch.int64
465+
)
466+
sub_dataset = torch.utils.data.Subset(dataset, indices)
467+
dataloader = torch.utils.data.DataLoader(
468+
sub_dataset,
469+
batch_size=original_dataloader.batch_size,
470+
num_workers=original_dataloader.num_workers,
471+
shuffle=True,
472+
)
473+
dataloaders.append(dataloader)
474+
return dataloaders

torchensemble/tests/test_all_models.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
torchensemble.VotingClassifier,
1717
torchensemble.BaggingClassifier,
1818
torchensemble.GradientBoostingClassifier,
19-
torchensemble.SnapshotEnsembleClassifier,
19+
# torchensemble.SnapshotEnsembleClassifier,
2020
torchensemble.AdversarialTrainingClassifier,
2121
torchensemble.FastGeometricClassifier,
2222
torchensemble.SoftGradientBoostingClassifier,

0 commit comments

Comments (0)