Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🌎 GUID Resolution #92

Merged
merged 2 commits into from
Aug 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 22 additions & 4 deletions chowda/flows/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,18 +58,36 @@ def get_batch(self, n):
)['items']

def batch_ingest_page(self, n):
from sqlmodel import Session
from re import search, split

from sqlmodel import Session, select

from chowda.db import engine
from chowda.models import SonyCiAsset
from chowda.models import MediaFile, SonyCiAsset
from chowda.utils import upsert

with Session(engine) as session:
batch = self.get_batch(n)
media = [SonyCiAsset(**asset) for asset in batch]
results = []
for m in media:
results.append(session.execute(upsert(SonyCiAsset, m, ['id'])))
for asset in media:
results.append(session.execute(upsert(SonyCiAsset, asset, ['id'])))
# If it's a GUID
if search('^cpb-aacip-', asset.name):
# Extract the GUID name
guid = split(r'_|\.|-dupe', asset.name)[0]
# Check for existing MediaFile
media_file = session.exec(
select(MediaFile).where(MediaFile.guid == guid)
).first()
if not media_file:
# Create a new MediaFile with the new guid
media_file = MediaFile(guid=guid)
ci_asset = session.get(SonyCiAsset, asset.id)
# Add the asset to the existing MediaFile
media_file.assets.append(ci_asset)
session.add(media_file)

result = sum([r.rowcount for r in results])
session.commit()
log.success(f'Ingested page {n} with {result} assets')
Expand Down
35 changes: 26 additions & 9 deletions chowda/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,35 +61,49 @@ async def __admin_repr__(self, request: Request):


class MediaFileCollectionLink(SQLModel, table=True):
media_file_id: Optional[int] = Field(
default=None, foreign_key='media_files.id', primary_key=True
media_file_id: Optional[str] = Field(
default=None, foreign_key='media_files.guid', primary_key=True
)
collection_id: Optional[int] = Field(
default=None, foreign_key='collections.id', primary_key=True
)


class MediaFileBatchLink(SQLModel, table=True):
media_file_id: Optional[int] = Field(
default=None, foreign_key='media_files.id', primary_key=True
media_file_id: Optional[str] = Field(
default=None, foreign_key='media_files.guid', primary_key=True
)
batch_id: Optional[int] = Field(
default=None, foreign_key='batches.id', primary_key=True
)


class MediaFileSonyCiAssetLink(SQLModel, table=True):
media_file_id: Optional[str] = Field(
default=None, foreign_key='media_files.guid', primary_key=True
)
sonyci_asset_id: Optional[str] = Field(
default=None, foreign_key='sonyci_assets.id', primary_key=True
)


class MediaFile(SQLModel, table=True):
"""Media file model

Attributes:
id: SonyCi asset id
guid: asset guid
guid: MediaFile GUID
assets: List of SonyCiAssets
collections: List of Collections
batches: List of Batches
clams_events: List of ClamsEvents
"""

__tablename__ = 'media_files'
id: Optional[int] = Field(primary_key=True, default=None)
guid: str = Field(index=True)
guid: Optional[str] = Field(primary_key=True, default=None, index=True)
mmif_json: Dict[str, Any] = Field(sa_column=Column(JSON), default=None)
assets: List['SonyCiAsset'] = Relationship(
back_populates='media_files', link_model=MediaFileSonyCiAssetLink
)
collections: List['Collection'] = Relationship(
back_populates='media_files', link_model=MediaFileCollectionLink
)
Expand Down Expand Up @@ -127,6 +141,9 @@ class SonyCiAsset(SQLModel, table=True):
thumbnails: Optional[List[Dict[str, Any]]] = Field(
sa_column=Column(postgresql.ARRAY(JSON)), default=None
)
media_files: List[MediaFile] = Relationship(
back_populates='assets', link_model=MediaFileSonyCiAssetLink
)


class Collection(SQLModel, table=True):
Expand Down Expand Up @@ -217,7 +234,7 @@ class ClamsEvent(SQLModel, table=True):
batch: Optional[Batch] = Relationship(back_populates='clams_events')
clams_app_id: Optional[int] = Field(default=None, foreign_key='clams_apps.id')
clams_app: Optional[ClamsApp] = Relationship(back_populates='clams_events')
media_file_id: Optional[int] = Field(default=None, foreign_key='media_files.id')
media_file_id: Optional[str] = Field(default=None, foreign_key='media_files.guid')
media_file: Optional[MediaFile] = Relationship(back_populates='clams_events')

async def __admin_repr__(self, request: Request):
Expand Down
5 changes: 2 additions & 3 deletions chowda/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
from json import loads
from typing import Any, ClassVar, Dict

from metaflow import Flow
from metaflow.exception import MetaflowNotFound
from requests import Request
from sqlmodel import Session, select
from starlette.responses import Response
Expand All @@ -15,9 +17,6 @@
from chowda.db import engine
from chowda.models import MediaFile

from metaflow import Flow
from metaflow.exception import MetaflowNotFound


@dataclass
class MediaFilesGuidLinkField(BaseField):
Expand Down
142 changes: 142 additions & 0 deletions docs/examples/guids.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Resolve filenames to GUIDs\n",
"\n",
"In order to address MediaFiles by GUIDs, we need to extract the GUID from the `SonyCiAsset.name` field.\n",
"\n",
"## Steps\n",
"\n",
"1. Get the list of assets from the DB\n",
"1. Filter out any name that does not start with `cpb-aacip-`\n",
"1. Split the name on any of:\n",
" - `_` underscore\n",
" - `.` period\n",
" - `-dupe`\n",
"\n",
"There are 8 records with `-dupe` in the name. All other records are correctly filtered with `_` and `.`.\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from chowda.db import engine\n",
"from chowda.models import SonyCiAsset\n",
"from sqlmodel import Session, select\n",
"from re import search, split\n",
"\n",
"\n",
"def get_asset():\n",
" with Session(engine) as session:\n",
" statement = select(SonyCiAsset)\n",
" results = session.exec(statement)\n",
"\n",
" return [\n",
" (asset.id, split('_|\\.|-dupe', asset.name)[0])\n",
" for asset in results.all()\n",
" if search('^cpb-aacip-', asset.name)\n",
" ]\n",
"\n",
"\n",
"# assets = get_asset()\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Check the list\n",
"\n",
"If needed, write the guid list to a file and check it manually.\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# len(assets)\n",
"\n",
"\n",
"def write_assets():\n",
" with open('guids.txt', 'w') as f:\n",
" for asset in assets:\n",
" f.write(f'{asset[1]}\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Insert MediaFiles\n",
"\n",
"Iterate through the list of assets:\n",
"\n",
"- Split the GUID from the filename.\n",
"- Search the database for a matching MediaFile object.\n",
" - Insert a new MediaFile object if it does not already exist.\n",
"- Find the SonyCiAsset object in the database.\n",
"- Link the MediaFile object to the SonyCiAsset object.\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"from chowda.models import MediaFile\n",
"\n",
"\n",
"def insert_media_files():\n",
" with Session(engine) as session:\n",
" for asset in assets:\n",
" # Extract the GUID name\n",
" guid = split(r'_|\\.|-dupe', asset[1])[0]\n",
" # media_file = session.get(MediaFile, guid)\n",
" media_file = session.exec(\n",
" select(MediaFile).where(MediaFile.guid == guid)\n",
" ).first()\n",
" if not media_file:\n",
" # Create a new MediaFile with the new guid\n",
" media_file = MediaFile(guid=guid)\n",
" # session.add(media_file)\n",
" ci_asset = session.get(SonyCiAsset, asset[0])\n",
" # Add the asset to the existing MediaFile\n",
" media_file.assets.append(ci_asset)\n",
" # asset.media_files.append(media_file)\n",
" session.add(media_file)\n",
" session.commit()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
43 changes: 0 additions & 43 deletions migrations/versions/9a6ec8c9597c_sonyci_assets.py

This file was deleted.

Loading