get rid of dataset library, it's not maintained anymore
karlicoss committed Aug 22, 2023
1 parent a3e0562 commit 8a5b2f3
Showing 3 changed files with 97 additions and 51 deletions.
setup.py: 2 changes (1 addition & 1 deletion)
@@ -28,7 +28,7 @@ def main() -> None:
author_email='karlicoss@gmail.com',
description='Backup and extract data from your Kobo reader',

-    install_requires=['pytz', 'dataset'],
+    install_requires=['pytz'],
extras_require={
'testing': ['pytest'],
'linting': ['pytest', 'mypy', 'lxml', 'types-pytz'],
src/kobuddy/__init__.py: 99 changes (49 additions & 50 deletions)
@@ -11,6 +11,7 @@
import json
import shutil
import struct
+import sqlite3
from contextlib import contextmanager
from datetime import datetime
from pathlib import Path
@@ -19,13 +20,11 @@
from typing import (Dict, Iterator, List, NamedTuple, Optional, Sequence, Set,
Tuple, Union, Iterable, Any)

-import dataset  # type: ignore
import pytz

from .common import get_logger, unwrap, cproperty, group_by_key, the, nullcontext, Res, sorted_res, split_res
from .kobo_device import get_kobo_mountpoint


+from .sqlite import sqlite_connection


# a bit nasty to have a global variable here... will rewrite later
@@ -379,28 +378,26 @@ def by_dict(self, d) -> Book:
assert res is not None, d
return res


class Extra(NamedTuple):
time_spent: int
percent: int
status: int
last_read: Optional[datetime]


-def _load_books(db) -> List[Tuple[Book, Extra]]:
+def _load_books(db: sqlite3.Connection) -> List[Tuple[Book, Extra]]:
logger = get_logger()
-    content_table = db.load_table('content')
items: List[Tuple[Book, Extra]] = []
-    # wtf... that fails with some sqlalchemy crap
-    # books = content_table.find(ContentType=6)
-    books = db.query('SELECT * FROM content WHERE ContentType=6')
+    books = db.execute('SELECT * FROM content WHERE ContentType=6')
for b in books:
content_id = b['ContentID']
isbn = b['ISBN']
title = b['Title'].strip() # ugh. not sure about that, but sometimes helped
author = b['Attribution']

# TODO not so sure about that; it was the case for KoboShelfes databases
-        time_spent = b.get('TimeSpentReading', 0)
+        time_spent = 0 if 'TimeSpentReading' not in b.keys() else b['TimeSpentReading']
percent = b['___PercentRead']
status = int(b['ReadStatus'])
last_read = b['DateLastRead']
@@ -503,13 +500,6 @@ class Types:
#


-def dataset_connect_ro(db: Path):
-    # support read only filesystems (also guarantees we don't change the database by accident)
-    import sqlite3
-    creator = lambda: sqlite3.connect(f'file:{db}?immutable=1', uri=True)
-    return dataset.connect('sqlite:///', engine_kwargs={'creator': creator})


# TODO use literal mypy types?
def _iter_events_aux(limit=None, errors='throw') -> Iterator[Res[Event]]:
# TODO handle all_ here?
@@ -522,10 +512,15 @@ def _iter_events_aux(limit=None, errors='throw') -> Iterator[Res[Event]]:
# TODO do it if it's defensive?
books = Books(create_if_missing=True)

-    for fname in dbs:
-        logger.info('processing %s', fname)
-        db = dataset_connect_ro(fname)

+    def connections():
+        for fname in dbs:
+            logger.info(f'processing {fname}')
+            with sqlite_connection(fname, immutable=True, row_factory='row') as db:
+                yield fname, db


+    for fname, db in connections():
for b, extra in _load_books(db):
books.add(b)
if extra is None:
@@ -536,7 +531,7 @@ def _iter_events_aux(limit=None, errors='throw') -> Iterator[Res[Event]]:
yield FinishedEvent(dt=dt, book=b, time_spent_s=extra.time_spent, eid=f'{b.content_id}-{fname.name}')

ET = EventTbl
-        for i, row in enumerate(db.query(f'SELECT {ET.EventType}, {ET.EventCount}, {ET.LastOccurrence}, {ET.ContentID}, {ET.Checksum}, hex({ET.ExtraData}) from Event')): # TODO order by?
+        for i, row in enumerate(db.execute(f'SELECT {ET.EventType}, {ET.EventCount}, {ET.LastOccurrence}, {ET.ContentID}, {ET.Checksum}, hex({ET.ExtraData}) from Event')): # TODO order by?
try:
yield from _iter_events_aux_Event(row=row, books=books, idx=i)
except Exception as e:
@@ -550,7 +545,7 @@ def _iter_events_aux(limit=None, errors='throw') -> Iterator[Res[Event]]:
AE = AnalyticsEvents
# events_table = db.load_table('AnalyticsEvents')
# TODO ugh. used to be events_table.all(), but started getting some 'Mandatory' field with a wrong schema at some point...
-        for row in db.query(f'SELECT {AE.Id}, {AE.Timestamp}, {AE.Type}, {AE.Attributes}, {AE.Metrics} from AnalyticsEvents'): # TODO order by??
+        for row in db.execute(f'SELECT {AE.Id}, {AE.Timestamp}, {AE.Type}, {AE.Attributes}, {AE.Metrics} from AnalyticsEvents'): # TODO order by??
try:
yield from _iter_events_aux_AnalyticsEvents(row=row, books=books)
except Exception as e:
@@ -839,18 +834,19 @@ def _iter_events_aux_AnalyticsEvents(*, row, books: Books) -> Iterator[Event]:
logger.warning(f'Unhandled entry of type {tp}: {row}')


-def _get_books():
+def _get_books() -> Books:
books = Books()
for bfile in DATABASES:
-        # TODO dispose?
-        db = dataset_connect_ro(bfile)
-        for b, _ in _load_books(db):
-            books.add(b)
+        with sqlite_connection(bfile, immutable=True, row_factory='row') as db:
+            for b, _ in _load_books(db):
+                books.add(b)
return books

-def get_books():
+def get_books() -> List[Book]:
return _get_books().all()


def _iter_highlights(**kwargs) -> Iterator[Highlight]:
logger = get_logger()
books = _get_books()
@@ -864,31 +860,31 @@
yielded.add(h)


-def _load_highlights(bfile: Path, books: Books):
+def _load_highlights(bfile: Path, books: Books) -> Iterator[Highlight]:
logger = get_logger()
logger.info(f"Using %s for highlights", bfile)
db = dataset_connect_ro(bfile)
for bm in db.query('SELECT * FROM Bookmark'):
volumeid = bm['VolumeID']
mbook = books.by_content_id(volumeid)
if mbook is None:
# sometimes Kobo seems to recycle old books without recycling the corresponding bookmarks
# so we need to be a bit defensive here
# see https://github.com/karlicoss/kobuddy/issues/18
book = books.make_orphan_book(volumeid=volumeid)
else:
book = mbook
# todo maybe in the future it could be a matter of error policy, i.e. throw vs yield exception vs use orphan object vs ignore
# could be example of useful defensiveness in a provider
yield Highlight(bm, book=book)
logger.info(f"Using {bfile} for highlights")
with sqlite_connection(bfile, immutable=True, row_factory='row') as db:
for bm in db.execute('SELECT * FROM Bookmark'):
volumeid = bm['VolumeID']
mbook = books.by_content_id(volumeid)
if mbook is None:
# sometimes Kobo seems to recycle old books without recycling the corresponding bookmarks
# so we need to be a bit defensive here
# see https://github.com/karlicoss/kobuddy/issues/18
book = books.make_orphan_book(volumeid=volumeid)
else:
book = mbook
# todo maybe in the future it could be a matter of error policy, i.e. throw vs yield exception vs use orphan object vs ignore
# could be example of useful defensiveness in a provider
yield Highlight(bm, book=book)


def _load_wordlist(bfile: Path):
logger = get_logger()
logger.info(f"Using %s for highlights", bfile)
db = dataset_connect_ro(bfile)
for bm in db.query('SELECT * FROM WordList'):
yield bm['Text']
logger.info(f"Using {bfile} for wordlist")
with sqlite_connection(bfile, immutable=True, row_factory='row') as db:
for bm in db.execute('SELECT * FROM WordList'):
yield bm['Text']


def get_highlights(**kwargs) -> List[Highlight]:
Expand Down Expand Up @@ -971,6 +967,7 @@ def _event_key(evt):
tie_breaker = 2
return (evt.dt, tie_breaker)


class BookEvents:
def __init__(self, book: Book, events):
assert all(e.book == book for e in events)
@@ -1011,10 +1008,12 @@ def get_books_with_events(**kwargs) -> Sequence[Res[BookEvents]]:
vit = sorted(vit, key=lambda be: be.last)
return list(chain(vit, eit))


def _fmt_dt(dt: datetime) -> str:
return dt.strftime('%d %b %Y %H:%M')

-def print_progress(full=True, **kwargs):
+def print_progress(full=True, **kwargs) -> None:
logger = get_logger()
for bevents in get_books_with_events(**kwargs):
if isinstance(bevents, Exception):
@@ -1032,12 +1031,12 @@ def print_progress(full=True, **kwargs):
print(f"-- {_fmt_dt(e.dt)}: {e.summary}")


-def print_books():
+def print_books() -> None:
for b in get_books():
print(b)


-def print_annotations():
+def print_annotations() -> None:
for i in get_highlights():
h = f"""
{_fmt_dt(i.dt)} {i._book}
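Aside (not part of the commit, just a sketch): the time_spent change above exists because the two libraries hand back different row types. dataset yielded plain dicts, so b.get('TimeSpentReading', 0) worked; sqlite3.Row supports keys() and mapping-style access but has no .get() method, hence the membership test. A minimal illustration, assuming an in-memory table where the optional column is absent:

import sqlite3

conn = sqlite3.connect(':memory:')
conn.row_factory = sqlite3.Row  # equivalent of row_factory='row' in the new helper
conn.execute('CREATE TABLE content (Title TEXT)')
conn.execute("INSERT INTO content VALUES ('Some Book')")
b = conn.execute('SELECT * FROM content').fetchone()

title = b['Title']  # mapping-style access works fine
# b.get('TimeSpentReading', 0)  # AttributeError: sqlite3.Row has no .get()
time_spent = 0 if 'TimeSpentReading' not in b.keys() else b['TimeSpentReading']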
src/kobuddy/sqlite.py: 47 changes (47 additions & 0 deletions)
@@ -0,0 +1,47 @@
# copy from HPI:my/core/sqlite.py
from contextlib import contextmanager
from pathlib import Path
import sqlite3
from typing import Callable, Any, Union, Literal, Optional, Iterator


PathIsh = Union[Path, str]


SqliteRowFactory = Callable[[sqlite3.Cursor, sqlite3.Row], Any]


def dict_factory(cursor, row):
fields = [column[0] for column in cursor.description]
return {key: value for key, value in zip(fields, row)}


Factory = Union[SqliteRowFactory, Literal['row', 'dict']]

@contextmanager
def sqlite_connection(db: PathIsh, *, immutable: bool=False, row_factory: Optional[Factory]=None) -> Iterator[sqlite3.Connection]:
dbp = f'file:{db}'
# https://www.sqlite.org/draft/uri.html#uriimmutable
if immutable:
# assert results in nicer error than sqlite3.OperationalError
assert Path(db).exists(), db
dbp = f'{dbp}?immutable=1'
row_factory_: Any = None
if row_factory is not None:
if callable(row_factory):
row_factory_ = row_factory
elif row_factory == 'row':
row_factory_ = sqlite3.Row
elif row_factory == 'dict':
row_factory_ = dict_factory
else:
raise RuntimeError(f"Can't happen: {row_factory}")

conn = sqlite3.connect(dbp, uri=True)
try:
conn.row_factory = row_factory_
with conn:
yield conn
finally:
        # the Connection context manager doesn't close the connection, it only commits/rolls back the transaction
conn.close()
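For reference, a short usage sketch of the new helper (the database path here is hypothetical): open a Kobo database read-only and access columns by name.

from pathlib import Path
from kobuddy.sqlite import sqlite_connection

db = Path('KoboReader.sqlite')  # hypothetical path to a Kobo database backup
with sqlite_connection(db, immutable=True, row_factory='row') as conn:
    for row in conn.execute('SELECT Title FROM content WHERE ContentType=6'):
        print(row['Title'])

The immutable=1 URI flag tells SQLite the file won't change, so it works on read-only filesystems and guarantees the database can't be modified by accident, the same behaviour the removed dataset_connect_ro provided.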
