cache safely tweaks
Adjusted much of the memoization to use a safer caching approach
adonm committed Apr 16, 2024
1 parent cd88d05 commit 68c1a03
Showing 11 changed files with 57 additions and 37 deletions.
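
The recurring pattern in this commit: each function whose result was previously memoized directly is split into a memoized *_safe variant that returns only simple, easily serialised values (a path to a persisted parquet file, a tuple of strings, or a list of dicts) and a thin, un-cached wrapper that rebuilds the DataFrame or UPath on every call. Below is a minimal sketch of that split, assuming the same diskcache memoize_stampede decorator used in nbdev_squ.core; fetch_rows_safe and fetch_rows are illustrative names, not part of the package.

import pandas
from diskcache import Cache, memoize_stampede

cache = Cache("/tmp/squ-demo")  # stand-in for nbdev_squ.core.cache

@memoize_stampede(cache, expire=60 * 60 * 3)  # cached value stays simple and picklable
def fetch_rows_safe():
    # return plain Python structures (a list of dicts), not a DataFrame
    return [{"alias": "agency1", "customerId": "1111"},
            {"alias": "agency2", "customerId": "2222"}]

def fetch_rows():
    # un-cached wrapper: rebuild the rich object from the cached primitives
    return pandas.DataFrame(fetch_rows_safe())
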
3 changes: 2 additions & 1 deletion .gitignore
@@ -150,4 +150,5 @@ checklink/cookies.txt
.pkg

# Quarto
.quarto
.quarto
/.jupyter
4 changes: 3 additions & 1 deletion build.sh
@@ -3,4 +3,6 @@
npm ci
npm run build
pip install build==1.2.1
python -m build
python -m build
# To release to pypi run below after a build
# twine upload --skip-existing dist/*
2 changes: 1 addition & 1 deletion nbdev_squ/__init__.py
@@ -1 +1 @@
__version__ = "1.3.4"
__version__ = "1.3.5"
3 changes: 3 additions & 0 deletions nbdev_squ/_modidx.py
@@ -16,8 +16,10 @@
'nbdev_squ.api.finalise_query': ('api.html#finalise_query', 'nbdev_squ/api.py'),
'nbdev_squ.api.hunt': ('api.html#hunt', 'nbdev_squ/api.py'),
'nbdev_squ.api.list_securityinsights': ('api.html#list_securityinsights', 'nbdev_squ/api.py'),
'nbdev_squ.api.list_securityinsights_safe': ('api.html#list_securityinsights_safe', 'nbdev_squ/api.py'),
'nbdev_squ.api.list_subscriptions': ('api.html#list_subscriptions', 'nbdev_squ/api.py'),
'nbdev_squ.api.list_workspaces': ('api.html#list_workspaces', 'nbdev_squ/api.py'),
'nbdev_squ.api.list_workspaces_safe': ('api.html#list_workspaces_safe', 'nbdev_squ/api.py'),
'nbdev_squ.api.loganalytics_query': ('api.html#loganalytics_query', 'nbdev_squ/api.py'),
'nbdev_squ.api.query_all': ('api.html#query_all', 'nbdev_squ/api.py'),
'nbdev_squ.api.security_alerts': ('api.html#security_alerts', 'nbdev_squ/api.py'),
@@ -26,6 +28,7 @@
'nbdev_squ.core': { 'nbdev_squ.core._cli': ('core.html#_cli', 'nbdev_squ/core.py'),
'nbdev_squ.core.azcli': ('core.html#azcli', 'nbdev_squ/core.py'),
'nbdev_squ.core.datalake_path': ('core.html#datalake_path', 'nbdev_squ/core.py'),
'nbdev_squ.core.datalake_path_safe': ('core.html#datalake_path_safe', 'nbdev_squ/core.py'),
'nbdev_squ.core.load_config': ('core.html#load_config', 'nbdev_squ/core.py'),
'nbdev_squ.core.login': ('core.html#login', 'nbdev_squ/core.py')},
'nbdev_squ.legacy': { 'nbdev_squ.legacy.adx_query': ('legacy.html#adx_query', 'nbdev_squ/legacy.py'),
26 changes: 17 additions & 9 deletions nbdev_squ/api.py
@@ -1,9 +1,9 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_api.ipynb.

# %% auto 0
__all__ = ['logger', 'clients', 'columns_of_interest', 'columns', 'Clients', 'list_workspaces', 'list_subscriptions',
'list_securityinsights', 'chunks', 'loganalytics_query', 'query_all', 'finalise_query', 'hunt',
'atlaskit_transformer', 'security_incidents', 'security_alerts']
__all__ = ['logger', 'clients', 'columns_of_interest', 'columns', 'Clients', 'list_workspaces_safe', 'list_workspaces',
'list_subscriptions', 'list_securityinsights_safe', 'list_securityinsights', 'chunks', 'loganalytics_query',
'query_all', 'finalise_query', 'hunt', 'atlaskit_transformer', 'security_incidents', 'security_alerts']

# %% ../nbs/01_api.ipynb 3
import pandas, json, logging, time, requests, io, pkgutil, httpx_cache
@@ -65,13 +65,20 @@ def tio(self):

# %% ../nbs/01_api.ipynb 13
@memoize_stampede(cache, expire=60 * 60 * 3) # cache for 3 hours
def list_workspaces(fmt: str = "df", # df, csv, json, list
def list_workspaces_safe(fmt: str = "df", # df, csv, json, list
agency: str = "ALL"): # Agency alias or ALL
path = datalake_path()
df = pandas.read_csv((path / "notebooks/lists/SentinelWorkspaces.csv").open())
df = df.join(pandas.read_csv((path / "notebooks/lists/SecOps Groups.csv").open()).set_index("Alias"), on="SecOps Group", rsuffix="_secops")
df = df.rename(columns={"SecOps Group": "alias", "Domains and IPs": "domains"})
df = df.dropna(subset=["customerId"]).sort_values(by="alias").convert_dtypes().reset_index()
persisted = f"{dirs.user_cache_dir}/list_workspaces.parquet"
df.to_parquet(persisted)
return persisted

def list_workspaces(fmt: str = "df", # df, csv, json, list
agency: str = "ALL"): # Agency alias or ALL
df = pandas.read_parquet(list_workspaces_safe(fmt, agency))
if agency != "ALL":
df = df[df["alias"] == agency]
if fmt == "df":
@@ -86,13 +93,12 @@ def list_workspaces(fmt: str = "df", # df, csv, json, list
raise ValueError("Invalid format")

# %% ../nbs/01_api.ipynb 17
@memoize_stampede(cache, expire=60 * 60 * 3) # cache for 3 hours
def list_subscriptions():
return pandas.DataFrame(azcli(["account", "list"]))["id"].unique()

@memoize_stampede(cache, expire=60 * 60 * 3) # cache for 3 hours
def list_securityinsights():
return pandas.DataFrame(azcli([
def list_securityinsights_safe():
return azcli([
"graph", "query", "--first", "1000", "-q",
"""
resources
@@ -104,14 +110,16 @@
on wlid
| extend customerId = properties.customerId
"""
])["data"])
])["data"]

def list_securityinsights():
return pandas.DataFrame(list_securityinsights_safe())

def chunks(items, size):
# Yield successive `size` chunks from `items`
for i in range(0, len(items), size):
yield items[i:i + size]

@memoize_stampede(cache, expire=60 * 5) # cache for 5 mins
def loganalytics_query(queries: list[str], timespan=pandas.Timedelta("14d"), batch_size=190, batch_delay=32, sentinel_workspaces = None):
"""
Run queries across all workspaces, in batches of `batch_size` with a minimum delay of `batch_delay` between batches.
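
Hypothetical usage of the reworked api functions above (assuming the package is installed and its datalake config is reachable): list_workspaces_safe is the memoized step and now returns only the path of a parquet file written under dirs.user_cache_dir, while list_workspaces reloads the frame from that file before filtering and formatting.

from nbdev_squ import api

parquet_path = api.list_workspaces_safe()   # memoized: returns a plain path string
df = api.list_workspaces(fmt="df")          # wrapper: reads the parquet, then filters/formats
insights = api.list_securityinsights()      # DataFrame rebuilt from the cached list of dicts
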
12 changes: 8 additions & 4 deletions nbdev_squ/core.py
@@ -1,7 +1,7 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/00_core.ipynb.

# %% auto 0
__all__ = ['logger', 'dirs', 'cache', 'retryer', 'load_config', 'login', 'azcli', 'datalake_path']
__all__ = ['logger', 'dirs', 'cache', 'retryer', 'load_config', 'login', 'azcli', 'datalake_path_safe', 'datalake_path']

# %% ../nbs/00_core.ipynb 3
import logging, subprocess, os, sys, pandas
@@ -102,14 +102,18 @@ def azcli(basecmd: list[str]):

# %% ../nbs/00_core.ipynb 16
@memoize_stampede(cache, expire=60 * 60 * 24)
def datalake_path(expiry_days: int=3, # Number of days until the SAS token expires
permissions: str="racwdlt" # Permissions to grant on the SAS token
):
def datalake_path_safe(expiry_days, permissions):
if not cache.get("logged_in"): # Have to login to grab keyvault config
login()
expiry = pandas.Timestamp("now") + pandas.Timedelta(days=expiry_days)
account = cache["config"]["datalake_account"].split(".")[0] # Grab the account name, not the full FQDN
container = cache['config']['datalake_container']
sas = azcli(["storage", "container", "generate-sas", "--auth-mode", "login", "--as-user",
"--account-name", account, "--name", container, "--permissions", permissions, "--expiry", str(expiry.date())])
return (container, account, sas)

def datalake_path(expiry_days: int=3, # Number of days until the SAS token expires
permissions: str="racwdlt" # Permissions to grant on the SAS token
):
container, account, sas = datalake_path_safe(expiry_days, permissions)
return UPath(f"az://{container}", account_name=account, sas_token=sas)
1 change: 0 additions & 1 deletion nbdev_squ/legacy.py
@@ -14,7 +14,6 @@
logger = logging.getLogger(__name__)

# %% ../nbs/02_legacy.ipynb 6
@memoize_stampede(api.cache, expire=60 * 5) # cache for 5 mins
def adx_query(kql):
"""
Run a kusto query
10 changes: 7 additions & 3 deletions nbs/00_core.ipynb
@@ -235,16 +235,20 @@
"source": [
"#| exports\n",
"@memoize_stampede(cache, expire=60 * 60 * 24)\n",
"def datalake_path(expiry_days: int=3, # Number of days until the SAS token expires\n",
" permissions: str=\"racwdlt\" # Permissions to grant on the SAS token\n",
" ):\n",
"def datalake_path_safe(expiry_days, permissions):\n",
" if not cache.get(\"logged_in\"): # Have to login to grab keyvault config\n",
" login()\n",
" expiry = pandas.Timestamp(\"now\") + pandas.Timedelta(days=expiry_days)\n",
" account = cache[\"config\"][\"datalake_account\"].split(\".\")[0] # Grab the account name, not the full FQDN\n",
" container = cache['config']['datalake_container']\n",
" sas = azcli([\"storage\", \"container\", \"generate-sas\", \"--auth-mode\", \"login\", \"--as-user\",\n",
" \"--account-name\", account, \"--name\", container, \"--permissions\", permissions, \"--expiry\", str(expiry.date())])\n",
" return (container, account, sas)\n",
"\n",
"def datalake_path(expiry_days: int=3, # Number of days until the SAS token expires\n",
" permissions: str=\"racwdlt\" # Permissions to grant on the SAS token\n",
" ):\n",
" container, account, sas = datalake_path_safe(expiry_days, permissions)\n",
" return UPath(f\"az://{container}\", account_name=account, sas_token=sas)"
]
},
20 changes: 14 additions & 6 deletions nbs/01_api.ipynb
@@ -183,13 +183,20 @@
"source": [
"#| exports\n",
"@memoize_stampede(cache, expire=60 * 60 * 3) # cache for 3 hours\n",
"def list_workspaces(fmt: str = \"df\", # df, csv, json, list\n",
"def list_workspaces_safe(fmt: str = \"df\", # df, csv, json, list\n",
" agency: str = \"ALL\"): # Agency alias or ALL\n",
" path = datalake_path()\n",
" df = pandas.read_csv((path / \"notebooks/lists/SentinelWorkspaces.csv\").open())\n",
" df = df.join(pandas.read_csv((path / \"notebooks/lists/SecOps Groups.csv\").open()).set_index(\"Alias\"), on=\"SecOps Group\", rsuffix=\"_secops\")\n",
" df = df.rename(columns={\"SecOps Group\": \"alias\", \"Domains and IPs\": \"domains\"})\n",
" df = df.dropna(subset=[\"customerId\"]).sort_values(by=\"alias\").convert_dtypes().reset_index()\n",
" persisted = f\"{dirs.user_cache_dir}/list_workspaces.parquet\"\n",
" df.to_parquet(persisted)\n",
" return persisted\n",
"\n",
"def list_workspaces(fmt: str = \"df\", # df, csv, json, list\n",
" agency: str = \"ALL\"): # Agency alias or ALL\n",
" df = pandas.read_parquet(list_workspaces_safe(fmt, agency))\n",
" if agency != \"ALL\":\n",
" df = df[df[\"alias\"] == agency]\n",
" if fmt == \"df\":\n",
@@ -236,13 +243,12 @@
"outputs": [],
"source": [
"#| exports\n",
"@memoize_stampede(cache, expire=60 * 60 * 3) # cache for 3 hours\n",
"def list_subscriptions():\n",
" return pandas.DataFrame(azcli([\"account\", \"list\"]))[\"id\"].unique()\n",
"\n",
"@memoize_stampede(cache, expire=60 * 60 * 3) # cache for 3 hours\n",
"def list_securityinsights():\n",
" return pandas.DataFrame(azcli([\n",
"def list_securityinsights_safe():\n",
" return azcli([\n",
" \"graph\", \"query\", \"--first\", \"1000\", \"-q\", \n",
" \"\"\"\n",
" resources\n",
@@ -254,14 +260,16 @@
" on wlid\n",
" | extend customerId = properties.customerId\n",
" \"\"\"\n",
" ])[\"data\"])\n",
" ])[\"data\"]\n",
"\n",
"def list_securityinsights():\n",
" return pandas.DataFrame(list_securityinsights_safe())\n",
"\n",
"def chunks(items, size):\n",
" # Yield successive `size` chunks from `items`\n",
" for i in range(0, len(items), size):\n",
" yield items[i:i + size]\n",
"\n",
"@memoize_stampede(cache, expire=60 * 5) # cache for 5 mins\n",
"def loganalytics_query(queries: list[str], timespan=pandas.Timedelta(\"14d\"), batch_size=190, batch_delay=32, sentinel_workspaces = None):\n",
" \"\"\"\n",
" Run queries across all workspaces, in batches of `batch_size` with a minimum delay of `batch_delay` between batches.\n",
9 changes: 0 additions & 9 deletions nbs/02_legacy.ipynb
@@ -75,7 +75,6 @@
"outputs": [],
"source": [
"#| exports\n",
"@memoize_stampede(api.cache, expire=60 * 5) # cache for 5 mins\n",
"def adx_query(kql):\n",
" \"\"\"\n",
" Run a kusto query\n",
@@ -438,14 +437,6 @@
"# convert to a jira friendly format\n",
"sentinel_beautify_local(incident.to_dict())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ef560e4c-4b15-4d91-b09b-31cb1e12f8bd",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
4 changes: 2 additions & 2 deletions settings.ini
@@ -1,7 +1,7 @@
[DEFAULT]
repo = nbdev-squ
lib_name = nbdev-squ
version = 1.3.4
version = 1.3.5
min_python = 3.10
license = apache2
black_formatting = False
@@ -27,7 +27,7 @@ keywords = nbdev jupyter notebook python
language = English
status = 3
user = adonm
requirements = tenacity platformdirs universal-pathlib adlfs diskcache azure-cli>=2.58 azure-monitor-query azure-kusto-data atlassian-python-api abuseipdb_wrapper>=0.2 pysigma pandas pyarrow dask python-benedict markdown pytenable httpx_cache msticpy[azsentinel]
requirements = tenacity platformdirs universal-pathlib adlfs diskcache azure-cli>=2.59 azure-monitor-query azure-kusto-data atlassian-python-api abuseipdb_wrapper>=0.2 pysigma pandas pyarrow dask python-benedict markdown pytenable httpx_cache msticpy[azsentinel]
readme_nb = index.ipynb
allowed_metadata_keys =
allowed_cell_metadata_keys =
