Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FEAT(index) add --sources & --overwrite to CLI to partially update db from config sources #211

Merged
merged 5 commits into from
Mar 7, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion doc/GUIDE.org
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,9 @@ Also see [[https://github.com/karlicoss/promnesia/issues/172][issues/172]].

** partial update

(experimental) Set env variable =PROMNESIA_INDEX_POLICY=update=.
Only the sources given in =promnesia index --sources SOURCE [SOURCE] ...= are indexed
(or all sources, if no =--sources= is given). If =--overwrite= is given,
all existing visits are removed from the db prior to indexing.

** exclude files from =auto= indexer

Expand Down
115 changes: 97 additions & 18 deletions src/promnesia/__main__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
import argparse
import os
import logging
import inspect
import sys
from typing import List, Tuple, Optional, Dict, Sequence, Iterable, Iterator
from typing import List, Tuple, Optional, Dict, Sequence, Iterable, Iterator, Union
from pathlib import Path
from datetime import datetime
from .compat import check_call
from .compat import check_call, register_argparse_extend_action_in_pre_py38
from tempfile import TemporaryDirectory


Expand All @@ -19,7 +18,7 @@
from .extract import extract_visits, make_filter


def iter_all_visits() -> Iterator[Res[DbVisit]]:
def iter_all_visits(sources_subset: Iterable[Union[str, int]]=()) -> Iterator[Res[DbVisit]]:
cfg = config.get()
output_dir = cfg.output_dir
# not sure if belongs here??
Expand All @@ -29,8 +28,22 @@ def iter_all_visits() -> Iterator[Res[DbVisit]]:

hook = cfg.hook

indexers = cfg.sources
for idx in indexers:
indexers = list(cfg.sources)

is_subset_sources = bool(sources_subset)
if is_subset_sources:
sources_subset = set(sources_subset)

for i, idx in enumerate(indexers):
name = getattr(idx, "name", None)
if name and is_subset_sources:
matched = name in sources_subset or i in sources_subset
if matched:
sources_subset -= {i, name} # type: ignore
else:
logger.debug("skipping '%s' not in --sources.", name)
continue

if isinstance(idx, Exception):
yield idx
continue
Expand All @@ -45,12 +58,15 @@ def iter_all_visits() -> Iterator[Res[DbVisit]]:
except Exception as e:
yield e

if sources_subset:
logger.warning("unknown --sources: %s", ", ".join(repr(i) for i in sources_subset))


def _do_index(dry: bool=False) -> Iterable[Exception]:
def _do_index(dry: bool=False, sources_subset: Iterable[Union[str, int]]=(), overwrite_db=False) -> Iterable[Exception]:
# also keep & return errors for further display
errors: List[Exception] = []
def it() -> Iterable[Res[DbVisit]]:
for v in iter_all_visits():
for v in iter_all_visits(sources_subset):
if isinstance(v, Exception):
errors.append(v)
yield v
Expand All @@ -61,17 +77,22 @@ def it() -> Iterable[Res[DbVisit]]:
for v in res:
print(v)
else:
dump_errors = visits_to_sqlite(it())
dump_errors = visits_to_sqlite(it(), overwrite_db=overwrite_db)
for e in dump_errors:
logger.exception(e)
errors.append(e)
return errors


def do_index(config_file: Path, dry: bool=False) -> None:
def do_index(
config_file: Path,
dry: bool=False,
sources_subset: Iterable[Union[str, int]]=(),
overwrite_db: bool=False,
) -> None:
config.load_from(config_file) # meh.. should be cleaner
try:
errors = list(_do_index(dry=dry))
errors = list(_do_index(dry=dry, sources_subset=sources_subset, overwrite_db=overwrite_db))
finally:
config.reset()
if len(errors) > 0:
Expand Down Expand Up @@ -104,7 +125,17 @@ def inner(*args, **kwargs):
return res


def do_demo(*, index_as: str, params: Sequence[str], port: Optional[str], config_file: Optional[Path], name: str='demo') -> None:
def do_demo(
*,
index_as: str,
params: Sequence[str],
port: Optional[str],
config_file: Optional[Path],
dry: bool=False,
name: str='demo',
sources_subset: Iterable[Union[str, int]]=(),
overwrite_db: bool=False,
) -> None:
from pprint import pprint
with TemporaryDirectory() as tdir:
outdir = Path(tdir)
Expand All @@ -120,7 +151,7 @@ def do_demo(*, index_as: str, params: Sequence[str], port: Optional[str], config
)
config.instance = cfg

errors = list(_do_index())
errors = list(_do_index(dry=dry, sources_subset=sources_subset, overwrite_db=overwrite_db))
if len(errors) > 0:
logger.error('%d errors during indexing (see logs above for backtraces)', len(errors))
for e in errors:
Expand Down Expand Up @@ -245,15 +276,47 @@ def cli_doctor_server(args: argparse.Namespace) -> None:
logger.info('You should see the database path and version above!')


def _ordinal_or_name(s: str) -> Union[str, int]:
    """Convert a ``--sources`` command-line value to an int when possible.

    Returns ``int(s)`` if the conversion succeeds; otherwise returns the
    string unchanged.
    """
    try:
        return int(s)
    except ValueError:
        return s


def main() -> None:
# TODO longer, literate description?

def add_index_args(parser: argparse.ArgumentParser, default_config_path: PathIsh=None) -> None:
"""
:param default_config_path:
if not given, all :func:`demo_sources()` are run
"""
register_argparse_extend_action_in_pre_py38(parser)
parser.add_argument('--config', type=Path, default=default_config_path, help='Config path')
parser.add_argument('--dry', action='store_true', help="Dry run, won't touch the database, only print the results out")
parser.add_argument(
'--sources',
required=False,
action="extend",
ankostis marked this conversation as resolved.
Show resolved Hide resolved
nargs="+",
type=_ordinal_or_name,
metavar="SOURCE",
help="Source names (or their 0-indexed position) to index.",
)
parser.add_argument(
'--overwrite',
required=False,
action="store_true",
help="Empty db before populating it with newly indexed visits."
" If interrupted, db is left untouched."
)

F = lambda prog: argparse.ArgumentDefaultsHelpFormatter(prog, width=120)
p = argparse.ArgumentParser(formatter_class=F) # type: ignore
subp = p.add_subparsers(dest='mode', )
ep = subp.add_parser('index', help='Create/update the link database', formatter_class=F)
ep.add_argument('--config', type=Path, default=default_config_path(), help='Config path')
ep.add_argument('--dry', action='store_true', help="Dry run, won't touch the database, only print the results out")
add_index_args(ep, default_config_path())
# TODO use some way to override or provide config only via cmdline?
ep.add_argument('--intermediate', required=False, help="Used for development, you don't need it")

Expand All @@ -269,13 +332,13 @@ def main() -> None:
ap.add_argument('--name', type=str, default='demo' , help='Set custom source name')
add_port_arg(ap)
ap.add_argument('--no-serve', action='store_const', const=None, dest='port', help='Pass to only index without running server')
ap.add_argument('--config', type=Path, required=False , help='Config to run against. If omitted, will use empty base config')
ankostis marked this conversation as resolved.
Show resolved Hide resolved
ap.add_argument(
'--as',
choices=list(sorted(demo_sources().keys())),
default='guess',
help='Promnesia source to index as (see https://github.com/karlicoss/promnesia/tree/master/src/promnesia/sources for the full list)',
)
add_index_args(ap)
ap.add_argument('params', nargs='*', help='Optional extra params for the indexer')

isp = subp.add_parser('install-server', help='Install server as a systemd service (for autostart)', formatter_class=F)
Expand Down Expand Up @@ -309,19 +372,35 @@ def main() -> None:
p.print_help(sys.stderr)
sys.exit(1)

logger.info("CLI args: %s", args)

# TODO maybe, it's better for server to compute intermediate representation?
# the only downside is storage. dunno.
# worst case -- could use database?

with get_tmpdir() as tdir: # TODO??
if args.mode == 'index':
do_index(config_file=args.config, dry=args.dry)
do_index(
config_file=args.config,
dry=args.dry,
sources_subset=args.sources,
overwrite_db=args.overwrite,
)
elif args.mode == 'serve':
server.run(args)
elif args.mode == 'demo':
# TODO not sure if 'as' is that useful
# something like Telegram/Takeout is too hard to setup to justify adhoc mode like this?
do_demo(index_as=getattr(args, 'as'), params=args.params, port=args.port, config_file=args.config, name=args.name)
do_demo(
index_as=getattr(args, 'as'),
params=args.params,
port=args.port,
config_file=args.config,
dry=args.dry,
name=args.name,
sources_subset=args.sources,
overwrite_db=args.overwrite,
)
elif args.mode == 'install-server': # todo rename to 'autostart' or something?
install_server.install(args)
elif args.mode == 'config':
Expand Down
23 changes: 21 additions & 2 deletions src/promnesia/compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,36 @@ def _fix(args: Paths) -> List[str]:
assert not isinstance(args, str), args # just to prevent shell=True calls...
return list(map(str, args))


import argparse

def register_argparse_extend_action_in_pre_py38(parser: argparse.ArgumentParser) -> None:
    """Make ``action="extend"`` usable on *parser* when running on Python < 3.8.

    On Python 3.8 and later nothing is registered, so whatever ``extend``
    action argparse itself provides is used.

    :param parser: the parser on which the ``extend`` action is registered.
    """
    import sys

    if sys.version_info >= (3, 8):
        return

    class ExtendAction(argparse.Action):
        """Accumulate the values of every occurrence of an option into one list."""

        def __call__(self, parser, namespace, values, option_string=None):
            # Copy instead of extending in place: the value currently stored
            # on the namespace may be the argument's ``default`` object, which
            # must not be mutated (and need not be a list).
            items = list(getattr(namespace, self.dest, None) or [])
            items.extend(values)
            setattr(namespace, self.dest, items)

    parser.register('action', 'extend', ExtendAction)


import subprocess

def run(args: Paths, **kwargs) -> subprocess.CompletedProcess:
    """Call ``subprocess.run`` with *args* passed through ``_fix`` first.

    Extra keyword arguments are forwarded to ``subprocess.run`` unchanged.
    """
    argv = _fix(args)
    return subprocess.run(argv, **kwargs)

def check_call(args: Paths, **kwargs) -> None:
    """Call ``subprocess.check_call`` with *args* passed through ``_fix`` first.

    Extra keyword arguments are forwarded unchanged; the return code of the
    underlying call is discarded.
    """
    argv = _fix(args)
    subprocess.check_call(argv, **kwargs)

def check_output(args: Paths, **kwargs) -> bytes:
    """Call ``subprocess.check_output`` with *args* passed through ``_fix`` first.

    Extra keyword arguments are forwarded unchanged and the result of the
    underlying call is returned.
    """
    argv = _fix(args)
    return subprocess.check_output(argv, **kwargs)

def Popen(args: Paths, **kwargs) -> subprocess.Popen:
    """Create a ``subprocess.Popen`` with *args* passed through ``_fix`` first.

    Extra keyword arguments are forwarded to ``subprocess.Popen`` unchanged.
    """
    argv = _fix(args)
    return subprocess.Popen(argv, **kwargs)

Expand Down
7 changes: 3 additions & 4 deletions src/promnesia/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,10 +73,9 @@ def cache_dir(self) -> Optional[Path]:
@property
def output_dir(self) -> Path:
odir = self.OUTPUT_DIR
if odir is not None:
return Path(odir)
else:
return default_output_dir()
opath = default_output_dir() if odir is None else Path(odir)
opath.mkdir(exist_ok=True, parents=True)
return opath

@property
def db(self) -> Path:
Expand Down
18 changes: 4 additions & 14 deletions src/promnesia/dump.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import os
from pathlib import Path
import shutil
from typing import Dict, List, Tuple, Set, Iterable
Expand All @@ -14,14 +13,6 @@
from . import config


def update_policy_active() -> bool:
    """Tell whether the ``PROMNESIA_INDEX_POLICY`` env variable equals ``update``.

    An unset variable is treated as ``overwrite_all``.
    """
    # NOTE: experimental.. need to make it a proper cmdline argument later
    # if 'update' is passed, will run against the existing db and only touch the sources present in the current index run
    # not sure if a good name for this..
    policy = os.environ.get('PROMNESIA_INDEX_POLICY', 'overwrite_all')
    return policy == 'update'


# NOTE: I guess the main performance benefit from this is not creating too many tmp lists and avoiding overhead
# since as far as sql is concerned it should all be in the same transaction. only a guess
# not sure it's the proper way to handle it
Expand All @@ -30,7 +21,7 @@ def update_policy_active() -> bool:


# returns critical warnings
def visits_to_sqlite(vit: Iterable[Res[DbVisit]]) -> List[Exception]:
def visits_to_sqlite(vit: Iterable[Res[DbVisit]], *, overwrite_db: bool) -> List[Exception]:
logger = get_logger()
db_path = config.get().db

Expand Down Expand Up @@ -58,8 +49,7 @@ def vit_ok() -> Iterable[DbVisit]:
yield ev

tpath = Path(get_tmpdir().name) / 'promnesia.tmp.sqlite'
policy_update = update_policy_active()
if not policy_update:
if overwrite_db:
engine = create_engine(f'sqlite:///{tpath}')
else:
engine = create_engine(f'sqlite:///{db_path}')
Expand All @@ -82,12 +72,12 @@ def vit_ok() -> Iterable[DbVisit]:
# pylint: disable=no-value-for-parameter
conn.execute(table.insert().values(bound))

if not policy_update:
if overwrite_db:
shutil.move(str(tpath), str(db_path))

errs = '' if errors == 0 else f', {errors} ERRORS'
total = ok + errors
what = 'updated' if policy_update else 'overwritten'
what = 'overwritten' if overwrite_db else 'updated'
logger.info('%s database "%s". %d total (%d OK%s)', what, db_path, total, ok, errs)
res: List[Exception] = []
if total == 0:
Expand Down
10 changes: 1 addition & 9 deletions tests/integration_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,7 @@

def run_index(cfg: Path, *, update=False) -> None:
from promnesia.__main__ import do_index
if update:
ev = 'PROMNESIA_INDEX_POLICY'
os.environ[ev] = 'update'
try:
do_index(cfg)
finally:
del os.environ[ev]
else:
do_index(cfg)
do_index(cfg, overwrite_db=not update)


index = run_index # legacy name
Expand Down