Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize Asset Paths #1312

Merged
merged 28 commits into from
Oct 19, 2022
Merged
Show file tree
Hide file tree
Changes from 25 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
efea959
Asset Paths design doc
jjnesbitt Sep 6, 2022
430392b
Add asset paths models and service
jjnesbitt Sep 9, 2022
444876d
Add asset path regex validator
jjnesbitt Sep 13, 2022
37f723a
Add asset paths ingestion command
jjnesbitt Sep 13, 2022
25dec83
Update serializers, use asset paths in views
jjnesbitt Sep 27, 2022
aef9ba2
Update client to use updated paths endpoint
jjnesbitt Oct 3, 2022
7cee38a
Reorg asset service
jjnesbitt Sep 28, 2022
648c90c
Update tests
jjnesbitt Sep 28, 2022
a64a567
Add asset paths on asset creation
jjnesbitt Sep 30, 2022
767ce77
Update asset path add/delete tests
jjnesbitt Sep 30, 2022
8e1fae6
Update asset paths when assets are updated in API
jjnesbitt Oct 3, 2022
14208cd
Rename asset path service functions
jjnesbitt Oct 3, 2022
5edb35d
Reorg asset paths service
jjnesbitt Oct 3, 2022
0e2116d
Add check constraints against asset path field
jjnesbitt Oct 4, 2022
12c64e1
Reorganize asset paths code
jjnesbitt Oct 11, 2022
09410a2
Move validate_path outside of Asset model
jjnesbitt Oct 11, 2022
0d23ff0
Don't inherit AssetPath models from TimeStampedModel
jjnesbitt Oct 11, 2022
5079062
Protect on deletion for AssetPath.asset
jjnesbitt Oct 11, 2022
0e27f5f
Use NotFound in paths endpoint
jjnesbitt Oct 12, 2022
248a61b
Don't include regex in asset path validation error
jjnesbitt Oct 12, 2022
c171288
Handle asset path copy on version publish
jjnesbitt Oct 17, 2022
4f6849b
Update asset path management command
jjnesbitt Oct 17, 2022
7b5f465
Add comment
jjnesbitt Oct 17, 2022
c095468
Update asset paths publish test
jjnesbitt Oct 17, 2022
f716ae5
Fix swagger endpoint annotation
jjnesbitt Oct 17, 2022
b56eb29
Fix incorrect type hint
jjnesbitt Oct 18, 2022
fd0a644
Allow '=' in asset paths
jjnesbitt Oct 19, 2022
4353b03
Order AssetPath querysets by 'path'
jjnesbitt Oct 19, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
151 changes: 151 additions & 0 deletions dandiapi/api/asset_paths.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
from __future__ import annotations

import os

from django.db import transaction
from django.db.models import Count, F, QuerySet

from dandiapi.api.models import Asset, AssetPath, AssetPathRelation, Version

####################################################################
# Dandiset and version deletion will cascade to asset path deletion.
# Thus, no explicit action is needed for these.
####################################################################


def extract_paths(path: str) -> list[str]:
nodepaths: list[str] = path.split('/')
for i in range(len(nodepaths))[1:]:
nodepaths[i] = os.path.join(nodepaths[i - 1], nodepaths[i])

return nodepaths


def get_root_paths(version: Version) -> QuerySet[AssetPath]:
"""Return all root paths for a version."""
# Use prefetch_related here instead of select_related,
# as otherwise the resulting join is very large
qs = AssetPath.objects.prefetch_related(
'asset',
'asset__blob',
'asset__embargoed_blob',
'asset__zarr',
)
return qs.filter(version=version).alias(num_parents=Count('parent_links')).filter(num_parents=1)


def get_path_children(path: AssetPath) -> QuerySet[AssetPath]:
"""Get all direct children from an existing path."""
qs = AssetPath.objects.select_related(
'asset',
'asset__blob',
'asset__embargoed_blob',
'asset__zarr',
)
path_ids = (
AssetPathRelation.objects.filter(parent=path, depth=1)
.values_list('child', flat=True)
.distinct()
)
return qs.filter(id__in=path_ids)
jjnesbitt marked this conversation as resolved.
Show resolved Hide resolved


def search_asset_paths(query: str, version: Version) -> QuerySet[AssetPath] | None:
"""Return all direct children of this path, if there are any."""
if not query:
return get_root_paths(version)

# Ensure no trailing slash
fixed_query = query.rstrip('/')

# Retrieve path
path = AssetPath.objects.filter(version=version, path=fixed_query).first()
if path is None:
return None

return get_path_children(path)


# TODO: Make idempotent
@transaction.atomic()
def add_asset_paths(asset: Asset, version: Version):
# Get or create leaf path
leaf, created = AssetPath.objects.get_or_create(path=asset.path, asset=asset, version=version)
if not created:
return

# Create absolute paths (exclude leaf node)
nodepaths = extract_paths(asset.path)[:-1]

# Create nodes
AssetPath.objects.bulk_create(
[AssetPath(path=path, version=version, asset=None) for path in nodepaths],
ignore_conflicts=True,
)

# Retrieve all paths
paths = [*AssetPath.objects.filter(version=version, path__in=nodepaths).order_by('path'), leaf]

# Create relations between paths
links = []
for i in range(len(paths)):
links.extend(
[
AssetPathRelation(parent=paths[i], child=paths[j], depth=j - i)
for j in range(len(paths))[i:]
]
)

# Create objects
AssetPathRelation.objects.bulk_create(links, ignore_conflicts=True)

# Get all relations (including leaf node)
parent_ids = (
AssetPathRelation.objects.filter(child=leaf)
.distinct('parent')
.values_list('parent', flat=True)
)

# Update size + file count
AssetPath.objects.filter(id__in=parent_ids).update(
aggregate_size=F('aggregate_size') + asset.size, aggregate_files=F('aggregate_files') + 1
)


@transaction.atomic()
def delete_asset_paths(asset: Asset, version: Version):
leaf: AssetPath = AssetPath.objects.get(asset=asset, version=version)

# Fetch parents
parent_ids = (
AssetPathRelation.objects.filter(child=leaf)
.distinct('parent')
.values_list('parent', flat=True)
)
parent_paths = AssetPath.objects.filter(id__in=parent_ids)

# Update parents
parent_paths.update(
aggregate_size=F('aggregate_size') - asset.size, aggregate_files=F('aggregate_files') - 1
)

# Ensure integrity
leaf.refresh_from_db()
assert leaf.aggregate_size == 0
assert leaf.aggregate_files == 0

# Delete leaf node and any other paths with no contained files
AssetPath.objects.filter(aggregate_files=0).delete()


@transaction.atomic()
def update_asset_paths(old_asset: Asset, new_asset: Version, version: Version):
jjnesbitt marked this conversation as resolved.
Show resolved Hide resolved
delete_asset_paths(old_asset, version)
add_asset_paths(new_asset, version)


@transaction.atomic()
def add_version_asset_paths(version: Version):
"""Add every asset from a version."""
for asset in version.assets.iterator():
add_asset_paths(asset, version)
12 changes: 12 additions & 0 deletions dandiapi/api/management/commands/ingest_asset_paths.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import djclick as click

from dandiapi.api.asset_paths import add_version_asset_paths
from dandiapi.api.models import Version


@click.command()
def ingest_asset_paths():
for version in Version.objects.iterator():
print(f'Version: {version}')
print(f'\t {version.assets.count()} assets')
add_version_asset_paths(version)
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
# Generated by Django 4.1.1 on 2022-10-11 19:44

from django.db import migrations, models
import django.db.models.deletion

import dandiapi.api.models.asset


class Migration(migrations.Migration):

dependencies = [
('api', '0037_alter_version_status'),
]

operations = [
migrations.CreateModel(
name='AssetPath',
fields=[
(
'id',
models.BigAutoField(
auto_created=True, primary_key=True, serialize=False, verbose_name='ID'
),
),
('path', models.CharField(max_length=512)),
('aggregate_files', models.PositiveBigIntegerField(default=0)),
('aggregate_size', models.PositiveBigIntegerField(default=0)),
],
),
migrations.CreateModel(
name='AssetPathRelation',
fields=[
(
'id',
models.BigAutoField(
auto_created=True, primary_key=True, serialize=False, verbose_name='ID'
),
),
('depth', models.PositiveIntegerField()),
],
),
migrations.AlterField(
model_name='asset',
name='path',
field=models.CharField(
max_length=512, validators=[dandiapi.api.models.asset.validate_asset_path]
),
),
migrations.AddConstraint(
model_name='asset',
constraint=models.CheckConstraint(
check=models.Q(
('path__regex', '^([A-z0-9(),&\\s#+~_-]?\\/?\\.?[A-z0-9(),&\\s#+~_-])+$')
),
name='asset_path_regex',
),
),
migrations.AddConstraint(
model_name='asset',
constraint=models.CheckConstraint(
check=models.Q(('path__startswith', '/'), _negated=True),
name='asset_path_no_leading_slash',
),
),
migrations.AddField(
model_name='assetpathrelation',
name='child',
field=models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
related_name='parent_links',
to='api.assetpath',
),
),
migrations.AddField(
model_name='assetpathrelation',
name='parent',
field=models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
related_name='child_links',
to='api.assetpath',
),
),
migrations.AddField(
model_name='assetpath',
name='asset',
field=models.ForeignKey(
blank=True,
null=True,
on_delete=django.db.models.deletion.PROTECT,
related_name='leaf_paths',
to='api.asset',
),
),
migrations.AddField(
model_name='assetpath',
name='version',
field=models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
related_name='asset_paths',
to='api.version',
),
),
migrations.AddConstraint(
model_name='assetpathrelation',
constraint=models.UniqueConstraint(
fields=('parent', 'child'), name='unique-relationship'
),
),
migrations.AddConstraint(
model_name='assetpath',
constraint=models.CheckConstraint(
check=models.Q(('path__endswith', '/'), _negated=True), name='consistent-slash'
),
),
migrations.AddConstraint(
model_name='assetpath',
constraint=models.UniqueConstraint(
fields=('asset', 'version'), name='unique-asset-version'
),
),
migrations.AddConstraint(
model_name='assetpath',
constraint=models.UniqueConstraint(
fields=('version', 'path'), name='unique-version-path'
),
),
]
3 changes: 3 additions & 0 deletions dandiapi/api/models/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from .asset import Asset, AssetBlob, EmbargoedAssetBlob
from .asset_paths import AssetPath, AssetPathRelation
from .dandiset import Dandiset
from .oauth import StagingApplication
from .upload import EmbargoedUpload, Upload
Expand All @@ -8,6 +9,8 @@
__all__ = [
'Asset',
'AssetBlob',
'AssetPath',
'AssetPathRelation',
'Dandiset',
'EmbargoedAssetBlob',
'EmbargoedUpload',
Expand Down
21 changes: 19 additions & 2 deletions dandiapi/api/models/asset.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
from __future__ import annotations

import datetime
import re
from urllib.parse import urlparse, urlunparse
import uuid

from django.conf import settings
from django.contrib.postgres.indexes import HashIndex
from django.core.exceptions import FieldError
from django.core.exceptions import FieldError, ValidationError
from django.core.validators import RegexValidator
from django.db import models
from django.db.models import Q
Expand All @@ -25,6 +26,18 @@
from .dandiset import Dandiset
from .version import Version

ASSET_CHARS_REGEX = r'[A-z0-9(),&\s#+~_-]'
ASSET_PATH_REGEX = fr'^({ASSET_CHARS_REGEX}?\/?\.?{ASSET_CHARS_REGEX})+$'


def validate_asset_path(path: str):
if path.startswith('/'):
raise ValidationError('Path must not begin with /')
if not re.match(ASSET_PATH_REGEX, path):
raise ValidationError('Path improperly formatted')

return path


class BaseAssetBlob(TimeStampedModel):
SHA256_REGEX = r'[0-9a-f]{64}'
Expand Down Expand Up @@ -106,7 +119,7 @@ class Status(models.TextChoices):
INVALID = 'Invalid'

asset_id = models.UUIDField(unique=True, default=uuid.uuid4)
path = models.CharField(max_length=512)
path = models.CharField(max_length=512, validators=[validate_asset_path])
blob = models.ForeignKey(
AssetBlob, related_name='assets', on_delete=models.CASCADE, null=True, blank=True
)
Expand Down Expand Up @@ -145,6 +158,10 @@ class Meta:
name='asset_metadata_has_schema_version',
check=Q(metadata__schemaVersion__isnull=False),
),
models.CheckConstraint(name='asset_path_regex', check=Q(path__regex=ASSET_PATH_REGEX)),
models.CheckConstraint(
name='asset_path_no_leading_slash', check=~Q(path__startswith='/')
),
Comment on lines +161 to +164
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is done as two check constraints as it was difficult to create a working/efficient regex that covered both.

]

@property
Expand Down
Loading