This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

add a cache to have_seen_event #9953

Merged
6 commits merged on Jun 1, 2021
Changes from 1 commit
1 change: 1 addition & 0 deletions changelog.d/9953.misc
@@ -0,0 +1 @@
Add a cache to `have_seen_events`.
1 change: 1 addition & 0 deletions synapse/storage/databases/main/cache.py
@@ -168,6 +168,7 @@ def _invalidate_caches_for_event(
backfilled,
):
self._invalidate_get_event_cache(event_id)
self.have_seen_event.prefill((event_id,), True)

self.get_latest_event_ids_in_room.invalidate((room_id,))

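Prefilling here marks the event as seen in the new have_seen_event cache as soon as it is persisted, so a later have_seen_events lookup need not hit the database. A minimal sketch of a prefillable keyed cache, purely for illustration (the class and names below are assumptions, not Synapse's cache descriptor):

# Toy stand-in for a keyed cache supporting prefill/get/invalidate; illustrative only.
from typing import Any, Dict, Optional, Tuple

class ToyCache:
    def __init__(self) -> None:
        self._entries: Dict[Tuple[Any, ...], Any] = {}

    def prefill(self, key: Tuple[Any, ...], value: Any) -> None:
        # Seed the entry so the next lookup for this key is a cache hit.
        self._entries[key] = value

    def get(self, key: Tuple[Any, ...]) -> Optional[Any]:
        return self._entries.get(key)

    def invalidate(self, key: Tuple[Any, ...]) -> None:
        self._entries.pop(key, None)

cache = ToyCache()
cache.prefill(("$just_persisted_event",), True)  # event was just persisted, so mark it seen
assert cache.get(("$just_persisted_event",)) is True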
34 changes: 29 additions & 5 deletions synapse/storage/databases/main/events_worker.py
@@ -22,6 +22,7 @@
Iterable,
List,
Optional,
Set,
Tuple,
overload,
)
@@ -55,7 +56,7 @@
from synapse.storage.util.id_generators import MultiWriterIdGenerator, StreamIdGenerator
from synapse.storage.util.sequence import build_sequence_generator
from synapse.types import JsonDict, get_domain_from_id
from synapse.util.caches.descriptors import cached
from synapse.util.caches.descriptors import cached, cachedList
from synapse.util.caches.lrucache import LruCache
from synapse.util.iterutils import batch_iter
from synapse.util.metrics import Measure
@@ -1046,7 +1047,7 @@ async def have_events_in_timeline(self, event_ids):

return {r["event_id"] for r in rows}

async def have_seen_events(self, event_ids):
async def have_seen_events(self, event_ids: Collection[str]) -> Set[str]:
"""Given a list of event ids, check if we have already processed them.

Args:
@@ -1055,23 +1056,46 @@
Returns:
set[str]: The events we have already seen.
"""
res = await self._have_seen_events_dict(event_ids)
return {x for (x, y) in res.items() if y}

@cachedList("have_seen_event", "event_ids")
async def _have_seen_events_dict(
self, event_ids: Collection[str]
) -> Dict[str, bool]:
"""Helper for have_seen_events

Returns a dict, which is the right format for @cachedList
"""
# if the event cache contains the event, obviously we've seen it.
results = {x for x in event_ids if self._get_event_cache.contains(x)}
cache_results = {x for x in event_ids if self._get_event_cache.contains(x)}
results = {x: True for x in cache_results}

def have_seen_events_txn(txn, chunk):
# assume everything in this chunk is not found initially
results.update({x: False for x in chunk})

# check the db and update the results for any row that is found
sql = "SELECT event_id FROM events as e WHERE "
clause, args = make_in_list_sql_clause(
txn.database_engine, "e.event_id", chunk
)
txn.execute(sql + clause, args)
results.update(row[0] for row in txn)
results.update({row[0]: True for row in txn})
Member

Don't we need to include the room ID here?

Member Author

yup.


for chunk in batch_iter((x for x in event_ids if x not in results), 100):
for chunk in batch_iter((x for x in event_ids if x not in cache_results), 100):
await self.db_pool.runInteraction(
"have_seen_events", have_seen_events_txn, chunk
)

return results

@cached(max_entries=100000)
async def have_seen_event(self, event_id):
# this only exists for the benefit of the @cachedList descriptor on
# _have_seen_events_dict
raise NotImplementedError()

def _get_current_state_event_counts_txn(self, txn, room_id):
"""
See get_current_state_event_counts.
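The shape of this change follows from how the batched cache descriptors work: @cachedList("have_seen_event", "event_ids") routes lookups through the cache of the @cached stub have_seen_event, and the wrapped helper must return a dict with one entry per requested key, which is why _have_seen_events_dict returns Dict[str, bool] and the public have_seen_events then filters that dict down to the keys that mapped to True. A rough sketch of the batching idea (illustrative only; the real descriptors also deal with Deferreds, invalidation and size limits):

# Illustrative sketch, not Synapse's descriptors.
from typing import Awaitable, Callable, Dict, Iterable, List

class SimpleListCache:
    """Per-key cache whose misses are filled by one batched, dict-returning loader."""

    def __init__(
        self, loader: Callable[[List[str]], Awaitable[Dict[str, bool]]]
    ) -> None:
        self._loader = loader
        self._cache: Dict[str, bool] = {}

    def prefill(self, key: str, value: bool) -> None:
        self._cache[key] = value

    def invalidate_all(self) -> None:
        self._cache.clear()

    async def get_many(self, keys: Iterable[str]) -> Dict[str, bool]:
        wanted = list(keys)
        results = {k: self._cache[k] for k in wanted if k in self._cache}
        missing = [k for k in wanted if k not in results]
        if missing:
            # The loader must return one entry per requested key; this is the
            # dict shape that _have_seen_events_dict provides above.
            fetched = await self._loader(missing)
            self._cache.update(fetched)
            results.update(fetched)
        return results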
27 changes: 22 additions & 5 deletions synapse/storage/databases/main/purge_events.py
@@ -16,14 +16,14 @@
from typing import Any, List, Set, Tuple

from synapse.api.errors import SynapseError
from synapse.storage._base import SQLBaseStore
from synapse.storage.databases.main import CacheInvalidationWorkerStore
from synapse.storage.databases.main.state import StateGroupWorkerStore
from synapse.types import RoomStreamToken

logger = logging.getLogger(__name__)


class PurgeEventsStore(StateGroupWorkerStore, SQLBaseStore):
class PurgeEventsStore(StateGroupWorkerStore, CacheInvalidationWorkerStore):
async def purge_history(
self, room_id: str, token: str, delete_local_events: bool
) -> Set[int]:
@@ -203,8 +203,6 @@ def _purge_history_txn(
"DELETE FROM event_to_state_groups "
"WHERE event_id IN (SELECT event_id from events_to_purge)"
)
for event_id, _ in event_rows:
txn.call_after(self._get_state_group_for_event.invalidate, (event_id,))

# Delete all remote non-state events
for table in (
@@ -283,6 +281,18 @@ def _purge_history_txn(
# so make sure to keep this actually last.
txn.execute("DROP TABLE events_to_purge")

for event_id, should_delete in event_rows:
self._invalidate_cache_and_stream(
txn, self._get_state_group_for_event, (event_id,)
)

# FIXME: this is racy - what if have_seen_event gets called between the
# transaction completing and the invalidation running?
if should_delete:
self._invalidate_cache_and_stream(
txn, self.have_seen_event, (event_id,)
)

logger.info("[purge] done")

return referenced_state_groups
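
On the FIXME above: the local invalidation is deferred until after the transaction completes, so there is a window between the DELETE committing and the callbacks running in which have_seen_event can be read (and re-cached) with stale data. A sketch of that ordering with stand-in names (the FakeTxn class and helper below are assumptions for illustration, not Synapse's code):

from typing import Any, Callable, List, Tuple

class FakeTxn:
    """Toy transaction that queues callbacks to run only after commit."""

    def __init__(self) -> None:
        self._after: List[Tuple[Callable[..., Any], tuple]] = []

    def call_after(self, fn: Callable[..., Any], *args: Any) -> None:
        self._after.append((fn, args))

    def commit(self) -> None:
        # Only now do the deferred invalidations run; a cache read that happens
        # between the database commit and this loop still sees the old entry.
        for fn, args in self._after:
            fn(*args)

def invalidate_cache_and_stream_sketch(txn: FakeTxn, cache: dict, key: str) -> None:
    # Drop the local in-memory entry once the transaction has completed; the
    # real helper also records the invalidation so other workers replay it.
    txn.call_after(cache.pop, key, None)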
@@ -422,7 +432,14 @@ def _purge_room_txn(self, txn, room_id: str) -> List[int]:
# index on them. In any case we should be clearing out 'stream' tables
# periodically anyway (#5888)

# TODO: we could probably usefully do a bunch of cache invalidation here
# TODO: we could probably usefully do a bunch more cache invalidation here

# we have no way to know which events to clear out of have_seen_event
# so just have to drop the whole cache
#
# FIXME: this is racy - what if have_seen_event gets called between the
# DELETE completing and the invalidation running?
self._invalidate_all_cache_and_stream(txn, self.have_seen_event)

logger.info("[purge] done")

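On the last comment: the room purge only knows the room_id, while have_seen_event is keyed by event_id, so there is no set of keys to invalidate individually and the whole cache has to go. An illustrative sketch of that trade-off (the function and its parameters are assumptions, not code from this PR):

from typing import Dict, Iterable, Optional

def invalidate_after_purge(
    cache: Dict[str, bool], purged_event_ids: Optional[Iterable[str]] = None
) -> None:
    if purged_event_ids is None:
        # The room-purge case above: the affected event IDs are unknown, so the
        # only safe option is to drop every entry.
        cache.clear()
    else:
        # Targeted invalidation, possible only when the purged IDs are known.
        for event_id in purged_event_ids:
            cache.pop(event_id, None)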