21
21
22
22
import itertools
23
23
import logging
24
- from typing import TYPE_CHECKING , Collection , Mapping , Set
24
+ from typing import (
25
+ TYPE_CHECKING ,
26
+ Collection ,
27
+ Mapping ,
28
+ Optional ,
29
+ Set ,
30
+ )
25
31
26
32
from synapse .logging .context import nested_logging_context
27
33
from synapse .metrics .background_process_metrics import wrap_as_background_process
34
+ from synapse .storage .database import LoggingTransaction
28
35
from synapse .storage .databases import Databases
36
+ from synapse .types .storage import _BackgroundUpdates
29
37
30
38
if TYPE_CHECKING :
31
39
from synapse .server import HomeServer
@@ -44,6 +52,11 @@ def __init__(self, hs: "HomeServer", stores: Databases):
44
52
self ._delete_state_groups_loop , 60 * 1000
45
53
)
46
54
55
+ self .stores .state .db_pool .updates .register_background_update_handler (
56
+ _BackgroundUpdates .MARK_UNREFERENCED_STATE_GROUPS_FOR_DELETION_BG_UPDATE ,
57
+ self ._background_delete_unrefereneced_state_groups ,
58
+ )
59
+
47
60
async def purge_room (self , room_id : str ) -> None :
48
61
"""Deletes all record of a room"""
49
62
@@ -81,7 +94,8 @@ async def purge_history(
81
94
)
82
95
83
96
async def _find_unreferenced_groups (
84
- self , state_groups : Collection [int ]
97
+ self ,
98
+ state_groups : Collection [int ],
85
99
) -> Set [int ]:
86
100
"""Used when purging history to figure out which state groups can be
87
101
deleted.
@@ -203,3 +217,232 @@ async def _delete_state_groups(
203
217
room_id ,
204
218
groups_to_sequences ,
205
219
)
220
+
221
async def _background_delete_unrefereneced_state_groups(
    self, progress: dict, batch_size: int
) -> int:
    """Background update handler that works through the state groups in
    batches, marking unreferenced ones as pending deletion.

    State groups are visited from the highest ID downwards. The position
    reached so far is persisted under the `last_checked_state_group`
    progress key, so each invocation resumes where the previous one stopped.

    Args:
        progress: Progress dict stored for this background update. It holds
            `last_checked_state_group` once at least one batch has run.
        batch_size: How many state groups to examine in this iteration.

    Returns:
        Always `batch_size`, whether or not any state group was processed.
    """
    state_db_pool = self.stores.state.db_pool
    updater = state_db_pool.updates
    update_name = (
        _BackgroundUpdates.MARK_UNREFERENCED_STATE_GROUPS_FOR_DELETION_BG_UPDATE
    )

    resume_from = progress.get("last_checked_state_group")

    if resume_from is None:
        # First run: start just above the highest existing state group so
        # that the highest group itself is included by the `id < ?` scan.
        max_state_group = await state_db_pool.simple_select_one_onecol(
            table="state_groups",
            keyvalues={},
            retcol="MAX(id)",
            allow_none=True,
            desc="get_max_state_group",
        )
        if max_state_group is None:
            # No state groups exist at all, so there is nothing to do.
            await updater._end_background_update(update_name)
            return batch_size
        resume_from = max_state_group + 1

    resume_from, final_batch = await self._delete_unreferenced_state_groups_batch(
        resume_from,
        batch_size,
    )

    if final_batch:
        # Every state group has been checked: the update is complete.
        await updater._end_background_update(update_name)
    else:
        # More state groups remain: record where the next run should resume.
        await updater._background_update_progress(
            update_name,
            {"last_checked_state_group": resume_from},
        )

    return batch_size
271
+
272
+ async def _delete_unreferenced_state_groups_batch (
273
+ self ,
274
+ last_checked_state_group : int ,
275
+ batch_size : int ,
276
+ ) -> tuple [int , bool ]:
277
+ """Looks for unreferenced state groups starting from the last state group
278
+ checked and marks them for deletion.
279
+
280
+ Args:
281
+ last_checked_state_group: The last state group that was checked.
282
+ batch_size: How many state groups to process in this iteration.
283
+
284
+ Returns:
285
+ (last_checked_state_group, final_batch)
286
+ """
287
+
288
+ # Find all state groups that can be deleted if any of the original set are deleted.
289
+ (
290
+ to_delete ,
291
+ last_checked_state_group ,
292
+ final_batch ,
293
+ ) = await self ._find_unreferenced_groups_for_background_deletion (
294
+ last_checked_state_group , batch_size
295
+ )
296
+
297
+ if len (to_delete ) == 0 :
298
+ return last_checked_state_group , final_batch
299
+
300
+ await self .stores .state_deletion .mark_state_groups_as_pending_deletion (
301
+ to_delete
302
+ )
303
+
304
+ return last_checked_state_group , final_batch
305
+
306
async def _find_unreferenced_groups_for_background_deletion(
    self,
    last_checked_state_group: int,
    batch_size: int,
) -> tuple[Set[int], int, bool]:
    """Used when deleting unreferenced state groups in the background to figure out
    which state groups can be deleted.

    To avoid increased DB usage due to de-deltaing state groups, this returns only
    state groups which are free standing (i.e. no shared edges with referenced groups)
    or state groups which do not share edges which result in a future referenced group.

    The following scenarios outline the possibilities based on state group data in
    the DB.

    Free standing -> state groups 1-N would be returned:
        SG_1
        |
        ...
        |
        SG_N

    Previous reference -> state groups 2-N would be returned:
        SG_1 <- referenced by event
        |
        SG_2
        |
        ...
        |
        SG_N

    Future reference -> none of the following state groups would be returned:
        SG_1
        |
        SG_2
        |
        ...
        |
        SG_N <- referenced by event

    Args:
        last_checked_state_group: The last state group that was checked. Only
            state groups with a strictly smaller ID are examined.
        batch_size: How many state groups to process in this iteration.

    Returns:
        (to_delete, last_checked_state_group, final_batch)
    """

    # If a state group's next edge is not pending deletion then we don't delete the
    # state group. If there is no next edge or the next edges are all marked for
    # deletion, then delete the state group.
    # This holds since we walk backwards from the latest state groups, ensuring that
    # we've already checked newer state groups for event references along the way.
    def get_next_state_groups_marked_for_deletion_txn(
        txn: LoggingTransaction,
    ) -> tuple[dict[int, bool], dict[int, int]]:
        # The outer ORDER BY matters: the ordering of the inner subquery is not
        # guaranteed to survive the joins, and the loop below depends on seeing
        # newer state groups before the older groups that precede them.
        state_group_sql = """
            SELECT s.id, e.state_group, d.state_group
            FROM (
                SELECT id FROM state_groups
                WHERE id < ? ORDER BY id DESC LIMIT ?
            ) as s
            LEFT JOIN state_group_edges AS e ON (s.id = e.prev_state_group)
            LEFT JOIN state_groups_pending_deletion AS d ON (e.state_group = d.state_group)
            ORDER BY s.id DESC
        """
        txn.execute(state_group_sql, (last_checked_state_group, batch_size))

        # Mapping from state group to whether we should delete it.
        state_groups_to_deletion: dict[int, bool] = {}

        # Mapping from state group to prev state group.
        state_groups_to_prev: dict[int, int] = {}

        for state_group, next_edge, pending_deletion in txn:
            if next_edge is not None:
                state_groups_to_prev[next_edge] = state_group

                # We have found an edge not marked for deletion.
                # Check previous results to see if this group is part of a chain
                # within this batch that qualifies for deletion.
                # e.g. batch contains:
                #   SG_1 -> SG_2 -> SG_3
                # If SG_3 is a candidate for deletion, then SG_2 & SG_1 should also
                # be, even though they have edges which may not be marked for
                # deletion.
                # This relies on SQL results being sorted in DESC order to work.
                if not pending_deletion and not state_groups_to_deletion.get(
                    next_edge
                ):
                    # The next group is outside this batch or is itself not
                    # deletable, so this group must be kept. A plain assignment
                    # (not setdefault) lets one bad edge override earlier rows.
                    state_groups_to_deletion[state_group] = False
                    continue

            # This state group may be a candidate for deletion, unless another
            # row has already ruled it out.
            state_groups_to_deletion.setdefault(state_group, True)

        return state_groups_to_deletion, state_groups_to_prev

    (
        state_groups_to_deletion,
        state_group_edges,
    ) = await self.stores.state.db_pool.runInteraction(
        "get_next_state_groups_marked_for_deletion",
        get_next_state_groups_marked_for_deletion_txn,
    )
    deletion_candidates = {
        state_group
        for state_group, deletion in state_groups_to_deletion.items()
        if deletion
    }

    # Fewer state groups than requested means we have reached the lowest ID.
    final_batch = len(state_groups_to_deletion) < batch_size
    if not final_batch:
        # The next batch continues below the smallest ID seen in this one.
        last_checked_state_group = min(state_groups_to_deletion)

    if not state_groups_to_deletion:
        return set(), last_checked_state_group, final_batch

    # Determine if any of the remaining state groups are directly referenced.
    referenced = await self.stores.main.get_referenced_state_groups(
        deletion_candidates
    )

    # Remove state groups from deletion_candidates which are directly referenced or
    # share a future edge with a referenced state group within this batch, by
    # walking from each referenced group back through its chain of prev groups.
    for referenced_group in referenced:
        group: Optional[int] = referenced_group
        while group is not None:
            deletion_candidates.discard(group)
            group = state_group_edges.get(group)

    return deletion_candidates, last_checked_state_group, final_batch
0 commit comments