Pre-conv features in image encoding for better feature matching #476

Open · wants to merge 2 commits into base: main

segment_anything/automatic_mask_generator.py: 3 changes (2 additions, 1 deletion)

@@ -48,6 +48,7 @@ def __init__(
         crop_n_points_downscale_factor: int = 1,
         point_grids: Optional[List[np.ndarray]] = None,
         min_mask_region_area: int = 0,
+        preconv_features: bool = False,
         output_mode: str = "binary_mask",
     ) -> None:
         """
@@ -120,7 +121,7 @@ def __init__(
         if min_mask_region_area > 0:
             import cv2  # type: ignore  # noqa: F401

-        self.predictor = SamPredictor(model)
+        self.predictor = SamPredictor(model, preconv_features=preconv_features)
         self.points_per_batch = points_per_batch
         self.pred_iou_thresh = pred_iou_thresh
         self.stability_score_thresh = stability_score_thresh
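
With this change the flag can be passed straight through the automatic mask generator, which simply forwards it to its internal SamPredictor. A minimal usage sketch (the checkpoint path and the dummy image below are assumptions for illustration, not part of this PR):

```python
import numpy as np
from segment_anything import sam_model_registry, SamAutomaticMaskGenerator

# Assumed local checkpoint path; any SAM checkpoint works the same way.
sam = sam_model_registry["vit_h"](checkpoint="sam_vit_h_4b8939.pth")

# The new flag is forwarded to the internal SamPredictor, so mask generation
# itself is unchanged; only the image encoder's return type differs.
mask_generator = SamAutomaticMaskGenerator(sam, preconv_features=True)

image = np.zeros((512, 512, 3), dtype=np.uint8)  # placeholder HxWx3 uint8 image
masks = mask_generator.generate(image)
```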

segment_anything/modeling/image_encoder.py: 8 changes (7 additions, 1 deletion)

@@ -32,6 +32,7 @@ def __init__(
         use_rel_pos: bool = False,
         rel_pos_zero_init: bool = True,
         window_size: int = 0,
+        preconv_features: bool = False,
         global_attn_indexes: Tuple[int, ...] = (),
     ) -> None:
         """
@@ -54,6 +55,7 @@
         """
         super().__init__()
         self.img_size = img_size
+        self.preconv_features = preconv_features

         self.patch_embed = PatchEmbed(
             kernel_size=(patch_size, patch_size),
@@ -111,9 +113,13 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         for blk in self.blocks:
             x = blk(x)

+        x_preconv = x.permute(0, 3, 1, 2)
         x = self.neck(x.permute(0, 3, 1, 2))

-        return x
+        if self.preconv_features:
+            return x, x_preconv
+        else:
+            return x


 class Block(nn.Module):
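
To make the two return modes concrete, here is a standalone sketch with a deliberately tiny encoder. The sizes below are arbitrary test values chosen so the example runs quickly, not the configurations used by the released SAM models (e.g. ViT-H uses img_size=1024 and embed_dim=1280); the flag itself is the one added by this PR.

```python
import torch
from segment_anything.modeling.image_encoder import ImageEncoderViT

# Tiny configuration purely for illustration; with this patch applied the
# constructor accepts the new preconv_features flag.
encoder = ImageEncoderViT(
    img_size=256,
    patch_size=16,
    embed_dim=64,
    depth=2,
    num_heads=2,
    out_chans=256,
    preconv_features=True,
)

x = torch.randn(1, 3, 256, 256)

feats, preconv = encoder(x)
print(feats.shape)    # post-neck embedding:  torch.Size([1, 256, 16, 16])
print(preconv.shape)  # pre-neck activations: torch.Size([1, 64, 16, 16])

encoder.preconv_features = False
print(encoder(x).shape)  # original single-tensor behaviour: torch.Size([1, 256, 16, 16])
```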

segment_anything/modeling/sam.py: 4 changes (3 additions, 1 deletion)

@@ -24,9 +24,11 @@ def __init__(
         image_encoder: ImageEncoderViT,
         prompt_encoder: PromptEncoder,
         mask_decoder: MaskDecoder,
+        preconv_features: bool = False,
         pixel_mean: List[float] = [123.675, 116.28, 103.53],
         pixel_std: List[float] = [58.395, 57.12, 57.375],
     ) -> None:
+
         """
         SAM predicts object masks from an image and input prompts.

@@ -95,7 +97,7 @@ def forward(
            to subsequent iterations of prediction.
         """
         input_images = torch.stack([self.preprocess(x["image"]) for x in batched_input], dim=0)
-        image_embeddings = self.image_encoder(input_images)
+        image_embeddings, _ = self.image_encoder(input_images)

         outputs = []
         for image_record, curr_embedding in zip(batched_input, image_embeddings):
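
Because the encoder can now return either a single tensor or a (neck, pre-neck) tuple depending on the flag, calling code that has to work in both modes can normalise the return value first. A small defensive sketch of my own, not part of this PR:

```python
from typing import Optional, Tuple

import torch

def encode_image(encoder, input_images: torch.Tensor) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
    """Normalise the encoder output to a (neck_features, preconv_features) pair.

    Works whether the encoder returns a single tensor (preconv_features=False)
    or a tuple (preconv_features=True); the second element is None otherwise.
    """
    out = encoder(input_images)
    return out if isinstance(out, tuple) else (out, None)
```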

segment_anything/predictor.py: 4 changes (3 additions, 1 deletion)

@@ -18,6 +18,7 @@ class SamPredictor:
     def __init__(
         self,
         sam_model: Sam,
+        preconv_features: bool = False
     ) -> None:
         """
         Uses SAM to calculate the image embedding for an image, and then
@@ -28,6 +29,7 @@ def __init__(
         """
         super().__init__()
         self.model = sam_model
+        self.model.image_encoder.preconv_features = preconv_features
         self.transform = ResizeLongestSide(sam_model.image_encoder.img_size)
         self.reset_image()

@@ -227,7 +229,7 @@ def predict_torch(

         # Predict masks
         low_res_masks, iou_predictions = self.model.mask_decoder(
-            image_embeddings=self.features,
+            image_embeddings=self.features[0] if type(self.features) == tuple else self.features,
             image_pe=self.model.prompt_encoder.get_dense_pe(),
             sparse_prompt_embeddings=sparse_embeddings,
             dense_prompt_embeddings=dense_embeddings,
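
Putting it together: with the flag enabled, predictor.features holds a (neck_features, preconv_features) tuple after set_image, and the pre-neck map can be used for dense matching. The cosine-similarity matching below is only an illustration of the intended use, not code from this PR, and the checkpoint and image paths are assumptions:

```python
import cv2
import torch.nn.functional as F
from segment_anything import sam_model_registry, SamPredictor

sam = sam_model_registry["vit_h"](checkpoint="sam_vit_h_4b8939.pth")  # assumed local path
predictor = SamPredictor(sam, preconv_features=True)

image = cv2.cvtColor(cv2.imread("example.jpg"), cv2.COLOR_BGR2RGB)  # assumed image path
predictor.set_image(image)

# With the flag enabled, predictor.features is a (neck_features, preconv_features) tuple,
# e.g. shapes (1, 256, 64, 64) and (1, 1280, 64, 64) for ViT-H.
neck_feats, preconv_feats = predictor.features

# Toy matching: cosine similarity between the centre cell and every other cell
# of the pre-neck feature map.
B, C, H, W = preconv_feats.shape
flat = F.normalize(preconv_feats.reshape(C, H * W), dim=0)  # (C, H*W), unit-norm columns
query = flat[:, (H // 2) * W + (W // 2)]                    # feature vector at the centre cell
similarity = (query @ flat).view(H, W)                      # (H, W) similarity map
```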