[Algo] Attribute-based Representations for Accurate and Interpretable Video Anomaly Detection #1040

Merged: 62 commits, May 9, 2023
Changes from 59 commits
Commits (62)
a6628f4
add implementation of ai-vad up to feature extraction
djdameln Mar 24, 2023
c0d2243
collect embeddings during training
djdameln Mar 24, 2023
23e2307
compute scores in validation step
djdameln Mar 27, 2023
6ecec01
full pipeline implemented
djdameln Mar 27, 2023
acd3e9f
move model components to separate files
djdameln Mar 29, 2023
5a1906e
move density estimation to separate file
djdameln Mar 30, 2023
a88793a
formatting
djdameln Mar 30, 2023
16ff446
make model configurable from config
djdameln Mar 30, 2023
6a21c92
add gaussian smoothing of temporal predictions
djdameln Apr 5, 2023
ac33e5a
regions from original implementation (WIP)
djdameln Apr 20, 2023
1fcd90c
add original regions
djdameln Apr 21, 2023
2915627
separate keypoint extraction from region extraction
djdameln Apr 21, 2023
28e4be4
make target frame configurable
djdameln Apr 21, 2023
9a34c87
add test for gt frame handling
djdameln Apr 21, 2023
e33beba
update changelog
djdameln Apr 21, 2023
145b888
fix shape inference in visualizer
djdameln Apr 24, 2023
d07c07a
Merge branch 'video-gt-handling' into ai-vad-experimental
djdameln Apr 24, 2023
27f4788
remove video gt handling from ai-vad implementation
djdameln Apr 24, 2023
161048b
configure target frame
djdameln Apr 24, 2023
9d6ddda
docstrings
djdameln Apr 24, 2023
d3349ff
remove pre-supplied bboxes region extractor
djdameln Apr 24, 2023
6c0f61b
merge keypoint extractor and pose extractor
djdameln Apr 24, 2023
98335eb
clean up feature extractor
djdameln Apr 24, 2023
6a90ebf
only extract features of specified type
djdameln Apr 24, 2023
452ad92
raise error when no feature type selected
djdameln Apr 24, 2023
0462ed3
update image score computation
djdameln Apr 25, 2023
47d3191
typing and docstrings
djdameln Apr 25, 2023
528fe67
merge main
djdameln Apr 25, 2023
54baebe
import sorting
djdameln Apr 25, 2023
340813f
add third party clip implementation
djdameln Apr 26, 2023
d6520d7
update vocab file
djdameln Apr 26, 2023
8b24430
revert disable shuffle for training dataloader
djdameln Apr 26, 2023
1bf5ca2
Update src/anomalib/models/ai_vad/config.yaml
djdameln Apr 26, 2023
8d62764
remove reference to rdke
djdameln Apr 26, 2023
c766221
remove F1 from evaluation metrics
djdameln Apr 26, 2023
c1aeb83
Update src/anomalib/models/ai_vad/features.py
djdameln Apr 26, 2023
74e0e0d
Merge branch 'ai-vad' of https://github.com/djdameln/anomalib into ai…
djdameln Apr 26, 2023
e5c35b9
Update src/anomalib/models/ai_vad/density.py
djdameln Apr 26, 2023
f77d804
remove init
djdameln Apr 26, 2023
f2c20b1
change variable name
djdameln Apr 26, 2023
abfa244
ignore unused variable
djdameln Apr 26, 2023
e2a6df2
appearance -> deep
djdameln Apr 26, 2023
83fea19
Update src/anomalib/models/ai_vad/features.py
djdameln Apr 26, 2023
b1699c5
Merge branch 'ai-vad' of https://github.com/djdameln/anomalib into ai…
djdameln Apr 26, 2023
32d50e7
fix typo
djdameln Apr 26, 2023
7ec8921
Update src/anomalib/models/ai_vad/features.py
djdameln Apr 26, 2023
e3b978a
Update src/anomalib/models/ai_vad/features.py
djdameln Apr 26, 2023
8f821f4
improve readability
djdameln Apr 26, 2023
7db5ea9
Merge branch 'ai-vad' of https://github.com/djdameln/anomalib into ai…
djdameln Apr 26, 2023
0a635de
add todo
djdameln Apr 26, 2023
502e3a8
formatting
djdameln Apr 26, 2023
063f8d2
modify clip code to remove ftfy requirement
djdameln Apr 26, 2023
848e368
remove unused parts of clip implementation
djdameln Apr 26, 2023
4940ab5
fix warnings
djdameln Apr 26, 2023
30939f0
add clip to third party software list
djdameln Apr 26, 2023
74f2ec5
add missing variable to docstring
djdameln Apr 26, 2023
f75e5c4
2022 -> 2023
djdameln Apr 26, 2023
1900131
set target frame parameter in all video datasets
djdameln Apr 26, 2023
7f67a47
update parameter name and fix typo
djdameln Apr 26, 2023
453eb27
address codacy issues
djdameln May 3, 2023
0f59bd1
codacy
djdameln May 3, 2023
435c108
Update config.yaml
djdameln May 9, 2023
3 changes: 3 additions & 0 deletions src/anomalib/data/__init__.py
@@ -151,6 +151,7 @@ def get_datamodule(config: DictConfig | ListConfig) -> AnomalibDataModule:
task=config.dataset.task,
clip_length_in_frames=config.dataset.clip_length_in_frames,
frames_between_clips=config.dataset.frames_between_clips,
target_frame=config.dataset.target_frame,
image_size=(config.dataset.image_size[0], config.dataset.image_size[1]),
center_crop=center_crop,
normalization=config.dataset.normalization,
@@ -169,6 +170,7 @@ def get_datamodule(config: DictConfig | ListConfig) -> AnomalibDataModule:
task=config.dataset.task,
clip_length_in_frames=config.dataset.clip_length_in_frames,
frames_between_clips=config.dataset.frames_between_clips,
target_frame=config.dataset.target_frame,
image_size=(config.dataset.image_size[0], config.dataset.image_size[1]),
center_crop=center_crop,
normalization=config.dataset.normalization,
@@ -205,6 +207,7 @@ def get_datamodule(config: DictConfig | ListConfig) -> AnomalibDataModule:
task=config.dataset.task,
clip_length_in_frames=config.dataset.clip_length_in_frames,
frames_between_clips=config.dataset.frames_between_clips,
target_frame=config.dataset.target_frame,
image_size=(config.dataset.image_size[0], config.dataset.image_size[1]),
center_crop=center_crop,
normalization=config.dataset.normalization,
9 changes: 8 additions & 1 deletion src/anomalib/data/avenue.py
@@ -28,6 +28,7 @@
from pandas import DataFrame

from anomalib.data.base import AnomalibVideoDataModule, AnomalibVideoDataset
from anomalib.data.base.video import VideoTargetFrame
from anomalib.data.task_type import TaskType
from anomalib.data.utils import (
DownloadInfo,
@@ -140,6 +141,7 @@ class AvenueDataset(AnomalibVideoDataset):
split (Split): Split of the dataset, usually Split.TRAIN or Split.TEST
clip_length_in_frames (int, optional): Number of video frames in each clip.
frames_between_clips (int, optional): Number of frames between each consecutive video clip.
target_frame (VideoTargetFrame): Specifies the target frame in the video clip, used for ground truth retrieval
"""

def __init__(
@@ -151,8 +153,9 @@ def __init__(
split: Split,
clip_length_in_frames: int = 1,
frames_between_clips: int = 1,
target_frame: VideoTargetFrame = VideoTargetFrame.LAST,
) -> None:
super().__init__(task, transform, clip_length_in_frames, frames_between_clips)
super().__init__(task, transform, clip_length_in_frames, frames_between_clips, target_frame)

self.root = root if isinstance(root, Path) else Path(root)
self.gt_dir = gt_dir if isinstance(gt_dir, Path) else Path(gt_dir)
@@ -172,6 +175,7 @@ class Avenue(AnomalibVideoDataModule):
gt_dir (Path | str): Path to the ground truth files
clip_length_in_frames (int, optional): Number of video frames in each clip.
frames_between_clips (int, optional): Number of frames between each consecutive video clip.
target_frame (VideoTargetFrame): Specifies the target frame in the video clip, used for ground truth retrieval
task (TaskType): Task type, 'classification', 'detection' or 'segmentation'
image_size (int | tuple[int, int] | None, optional): Size of the input image.
Defaults to None.
@@ -198,6 +202,7 @@ def __init__(
gt_dir: Path | str,
clip_length_in_frames: int = 1,
frames_between_clips: int = 1,
target_frame: VideoTargetFrame = VideoTargetFrame.LAST,
task: TaskType = TaskType.SEGMENTATION,
image_size: int | tuple[int, int] | None = None,
center_crop: int | tuple[int, int] | None = None,
@@ -241,6 +246,7 @@ def __init__(
transform=transform_train,
clip_length_in_frames=clip_length_in_frames,
frames_between_clips=frames_between_clips,
target_frame=target_frame,
root=root,
gt_dir=gt_dir,
split=Split.TRAIN,
@@ -251,6 +257,7 @@ def __init__(
transform=transform_eval,
clip_length_in_frames=clip_length_in_frames,
frames_between_clips=frames_between_clips,
target_frame=target_frame,
root=root,
gt_dir=gt_dir,
split=Split.TEST,
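For context, a minimal usage sketch of the new target_frame argument on the Avenue datamodule; the dataset paths below are hypothetical, and the same argument is threaded through the ShanghaiTech and UCSDped datamodules further down.

from anomalib.data.avenue import Avenue
from anomalib.data.base.video import VideoTargetFrame

# Hypothetical paths; point these at a local copy of the CUHK Avenue dataset.
datamodule = Avenue(
    root="./datasets/avenue",
    gt_dir="./datasets/avenue/ground_truth_demo",
    clip_length_in_frames=2,
    frames_between_clips=1,
    target_frame=VideoTargetFrame.LAST,  # ground truth is retrieved for the last frame of each clip
)

With the default VideoTargetFrame.LAST, the returned ground truth corresponds to the final frame of each clip, as described in the docstrings above.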
9 changes: 8 additions & 1 deletion src/anomalib/data/shanghaitech.py
@@ -28,6 +28,7 @@
from torch import Tensor

from anomalib.data.base import AnomalibVideoDataModule, AnomalibVideoDataset
from anomalib.data.base.video import VideoTargetFrame
from anomalib.data.task_type import TaskType
from anomalib.data.utils import (
DownloadInfo,
@@ -187,6 +188,7 @@ class ShanghaiTechDataset(AnomalibVideoDataset):
split (Split): Split of the dataset, usually Split.TRAIN or Split.TEST
clip_length_in_frames (int, optional): Number of video frames in each clip.
frames_between_clips (int, optional): Number of frames between each consecutive video clip.
target_frame (VideoTargetFrame): Specifies the target frame in the video clip, used for ground truth retrieval
"""

def __init__(
@@ -198,8 +200,9 @@ def __init__(
split: Split,
clip_length_in_frames: int = 1,
frames_between_clips: int = 1,
target_frame: VideoTargetFrame = VideoTargetFrame.LAST,
):
super().__init__(task, transform, clip_length_in_frames, frames_between_clips)
super().__init__(task, transform, clip_length_in_frames, frames_between_clips, target_frame)

self.root = root
self.scene = scene
@@ -219,6 +222,7 @@ class ShanghaiTech(AnomalibVideoDataModule):
scene (int): Index of the dataset scene (category) in range [1, 13]
clip_length_in_frames (int, optional): Number of video frames in each clip.
frames_between_clips (int, optional): Number of frames between each consecutive video clip.
target_frame (VideoTargetFrame): Specifies the target frame in the video clip, used for ground truth retrieval
task (TaskType): Task type, 'classification', 'detection' or 'segmentation'
image_size (int | tuple[int, int] | None, optional): Size of the input image.
Defaults to None.
@@ -245,6 +249,7 @@ def __init__(
scene: int,
clip_length_in_frames: int = 1,
frames_between_clips: int = 1,
target_frame: VideoTargetFrame = VideoTargetFrame.LAST,
task: TaskType = TaskType.SEGMENTATION,
image_size: int | tuple[int, int] | None = None,
center_crop: int | tuple[int, int] | None = None,
@@ -288,6 +293,7 @@ def __init__(
transform=transform_train,
clip_length_in_frames=clip_length_in_frames,
frames_between_clips=frames_between_clips,
target_frame=target_frame,
root=root,
scene=scene,
split=Split.TRAIN,
@@ -298,6 +304,7 @@ def __init__(
transform=transform_eval,
clip_length_in_frames=clip_length_in_frames,
frames_between_clips=frames_between_clips,
target_frame=target_frame,
root=root,
scene=scene,
split=Split.TEST,
9 changes: 8 additions & 1 deletion src/anomalib/data/ucsd_ped.py
@@ -18,6 +18,7 @@
from torch import Tensor

from anomalib.data.base import AnomalibVideoDataModule, AnomalibVideoDataset
from anomalib.data.base.video import VideoTargetFrame
from anomalib.data.task_type import TaskType
from anomalib.data.utils import (
DownloadInfo,
@@ -155,6 +156,7 @@ class UCSDpedDataset(AnomalibVideoDataset):
split (str | Split | None): Split of the dataset, usually Split.TRAIN or Split.TEST
clip_length_in_frames (int, optional): Number of video frames in each clip.
frames_between_clips (int, optional): Number of frames between each consecutive video clip.
target_frame (VideoTargetFrame): Specifies the target frame in the video clip, used for ground truth retrieval
"""

def __init__(
@@ -166,8 +168,9 @@ def __init__(
split: Split,
clip_length_in_frames: int = 1,
frames_between_clips: int = 1,
target_frame: VideoTargetFrame = VideoTargetFrame.LAST,
) -> None:
super().__init__(task, transform, clip_length_in_frames, frames_between_clips)
super().__init__(task, transform, clip_length_in_frames, frames_between_clips, target_frame)

self.root_category = Path(root) / category
self.split = split
@@ -186,6 +189,7 @@ class UCSDped(AnomalibVideoDataModule):
category (str): Sub-category of the dataset, e.g. 'bottle'
clip_length_in_frames (int, optional): Number of video frames in each clip.
frames_between_clips (int, optional): Number of frames between each consecutive video clip.
target_frame (VideoTargetFrame): Specifies the target frame in the video clip, used for ground truth retrieval
task (TaskType): Task type, 'classification', 'detection' or 'segmentation'
image_size (int | tuple[int, int] | None, optional): Size of the input image.
Defaults to None.
@@ -215,6 +219,7 @@ def __init__(
category: str,
clip_length_in_frames: int = 1,
frames_between_clips: int = 1,
target_frame: VideoTargetFrame = VideoTargetFrame.LAST,
task: TaskType = TaskType.SEGMENTATION,
image_size: int | tuple[int, int] | None = None,
center_crop: int | tuple[int, int] | None = None,
@@ -258,6 +263,7 @@ def __init__(
transform=transform_train,
clip_length_in_frames=clip_length_in_frames,
frames_between_clips=frames_between_clips,
target_frame=target_frame,
root=root,
category=category,
split=Split.TRAIN,
@@ -268,6 +274,7 @@ def __init__(
transform=transform_eval,
clip_length_in_frames=clip_length_in_frames,
frames_between_clips=frames_between_clips,
target_frame=target_frame,
root=root,
category=category,
split=Split.TEST,
3 changes: 3 additions & 0 deletions src/anomalib/models/__init__.py
@@ -12,6 +12,7 @@
from omegaconf import DictConfig, ListConfig
from torch import load

from anomalib.models.ai_vad import AiVad
from anomalib.models.cfa import Cfa
from anomalib.models.cflow import Cflow
from anomalib.models.components import AnomalyModule
@@ -41,6 +42,7 @@
"ReverseDistillation",
"Rkde",
"Stfpm",
"AiVad",
]

logger = logging.getLogger(__name__)
@@ -92,6 +94,7 @@ def get_model(config: DictConfig | ListConfig) -> AnomalyModule:
"reverse_distillation",
"rkde",
"stfpm",
"ai_vad",
]
model: AnomalyModule

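As a rough illustration of what the new registration enables, the model can be resolved through the usual get_model factory; this sketch assumes the config.yaml shipped with the model is used as-is.

from omegaconf import OmegaConf

from anomalib.models import get_model

# Load the model's default configuration and let the factory resolve the "ai_vad" name.
config = OmegaConf.load("src/anomalib/models/ai_vad/config.yaml")
model = get_model(config)  # an AiVad-based AnomalyModule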
13 changes: 13 additions & 0 deletions src/anomalib/models/ai_vad/__init__.py
@@ -0,0 +1,13 @@
"""Implementatation of the AI-VAD Model.

AI-VAD: Accurate and Interpretable Video Anomaly Detection

Paper https://arxiv.org/pdf/2212.00789.pdf
"""

# Copyright (C) 2023 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

from .lightning_model import AiVad, AiVadLightning

__all__ = ["AiVad", "AiVadLightning"]
21 changes: 21 additions & 0 deletions src/anomalib/models/ai_vad/clip/LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2021 OpenAI

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.