From d2aa9c321d89392dcfc16bb7571f6a3c12a2dcc3 Mon Sep 17 00:00:00 2001 From: rimu <3310831+rimu@users.noreply.github.com> Date: Sun, 24 Nov 2024 16:00:53 +1300 Subject: [PATCH] clean up duplicate comments #348 --- app/models.py | 12 ++- .../c3cc707ab5e9_unique_post_reply_ap_id.py | 73 +++++++++++++++++++ 2 files changed, 81 insertions(+), 4 deletions(-) create mode 100644 migrations/versions/c3cc707ab5e9_unique_post_reply_ap_id.py diff --git a/app/models.py b/app/models.py index 19923c34..96bec31c 100644 --- a/app/models.py +++ b/app/models.py @@ -1631,7 +1631,7 @@ class PostReply(db.Model): edited_at = db.Column(db.DateTime) reports = db.Column(db.Integer, default=0) # how many times this post has been reported. Set to -1 to ignore reports - ap_id = db.Column(db.String(255), index=True) + ap_id = db.Column(db.String(255), index=True, unique=True) ap_create_id = db.Column(db.String(100)) ap_announce_id = db.Column(db.String(100)) @@ -1664,7 +1664,7 @@ class PostReply(db.Model): from_bot=user.bot, nsfw=post.nsfw, nsfl=post.nsfl, notify_author=notify_author, instance_id=user.instance_id, language_id=language_id, - ap_id=request_json['object']['id'] if request_json else None, + ap_id=request_json['object']['id'].lower() if request_json else None, ap_create_id=request_json['id'] if request_json else None, ap_announce_id=announce_id) if reply.body: @@ -1689,8 +1689,12 @@ class PostReply(db.Model): if reply_is_stupid(reply.body): raise Exception('Low quality reply') - db.session.add(reply) - db.session.commit() + try: + db.session.add(reply) + db.session.commit() + except IntegrityError: + db.session.rollback() + return PostReply.query.filter_by(ap_id=request_json['object']['id'].lower()).one() # Notify subscribers notify_about_post_reply(in_reply_to, reply) diff --git a/migrations/versions/c3cc707ab5e9_unique_post_reply_ap_id.py b/migrations/versions/c3cc707ab5e9_unique_post_reply_ap_id.py new file mode 100644 index 00000000..8e5de9e4 --- /dev/null +++ 
b/migrations/versions/c3cc707ab5e9_unique_post_reply_ap_id.py
@@ -0,0 +1,86 @@
+"""unique post_reply ap id
+
+Revision ID: c3cc707ab5e9
+Revises: 299e0384c8f3
+Create Date: 2024-11-24 15:47:03.293286
+
+"""
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy import text
+
+# revision identifiers, used by Alembic.
+revision = 'c3cc707ab5e9'
+down_revision = '299e0384c8f3'
+branch_labels = None
+depends_on = None
+
+
+def _delete_by_ids(sql):
+    """Return a DELETE statement whose :old_ids parameter expands to a list of ids."""
+    # An expanding bind parameter renders one placeholder per id, so the IN clause
+    # does not depend on the database driver adapting a Python tuple.
+    return text(sql).bindparams(sa.bindparam('old_ids', expanding=True))
+
+
+def upgrade():
+    """Delete duplicate comments, then make the index on post_reply.ap_id unique.
+
+    For each ap_id shared by several post_reply rows the row with the lowest id is
+    kept. The other rows are deleted, along with the rows in post_reply_vote,
+    report and post_reply_bookmark that carry their ids.
+    """
+    # Find ap_ids used by more than one comment. Comments without an ap_id are ignored.
+    dupes_query = text('''
+        SELECT ap_id FROM "post_reply"
+        WHERE ap_id IS NOT NULL
+        GROUP BY ap_id
+        HAVING COUNT(*) > 1
+    ''')
+    # Get all comments with the same ap_id, ordered by ID
+    comments_query = text('''
+        SELECT id FROM "post_reply"
+        WHERE ap_id = :ap_id
+        ORDER BY id
+    ''')
+    # Executed in this order: votes, reports and bookmarks first, the comments last.
+    delete_statements = [
+        _delete_by_ids('DELETE FROM "post_reply_vote" WHERE post_reply_id IN :old_ids'),
+        _delete_by_ids('DELETE FROM "report" WHERE suspect_post_reply_id IN :old_ids'),
+        _delete_by_ids('DELETE FROM "post_reply_bookmark" WHERE post_reply_id IN :old_ids'),
+        _delete_by_ids('DELETE FROM "post_reply" WHERE id IN :old_ids'),
+    ]
+
+    conn = op.get_bind()
+    # Read the whole result before running more statements on this connection.
+    duplicate_ap_ids = conn.execute(dupes_query).scalars().all()
+    print('Cleaning up duplicate comments, this may take a while...')
+
+    for ap_id in duplicate_ap_ids:
+        comment_ids = conn.execute(comments_query, {"ap_id": ap_id}).scalars().all()
+
+        # Keep the lowest ID and delete every other comment with this ap_id
+        old_ids = comment_ids[1:]
+
+        print(ap_id)
+
+        if old_ids:
+            for statement in delete_statements:
+                conn.execute(statement, {"old_ids": old_ids})
+
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table('post_reply', schema=None) as batch_op:
+        batch_op.drop_index('ix_post_reply_ap_id')
+        batch_op.create_index(batch_op.f('ix_post_reply_ap_id'), ['ap_id'], unique=True)
+
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    """Recreate the index on post_reply.ap_id as non-unique; no rows are restored."""
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table('post_reply', schema=None) as batch_op:
+        batch_op.drop_index(batch_op.f('ix_post_reply_ap_id'))
+        batch_op.create_index('ix_post_reply_ap_id', ['ap_id'], unique=False)
+
+    # ### end Alembic commands ###