From 5210352e83a212a5052ad72555718b625815e194 Mon Sep 17 00:00:00 2001 From: aroberts-fox Date: Mon, 2 Dec 2024 14:29:11 -0500 Subject: [PATCH 1/9] Adding remote server scan in admin --- app/admin/forms.py | 7 ++ app/admin/routes.py | 164 +++++++++++++++++++++++++++- app/admin/util.py | 6 +- app/templates/admin/federation.html | 21 +++- 4 files changed, 190 insertions(+), 8 deletions(-) diff --git a/app/admin/forms.py b/app/admin/forms.py index d23943a8..23ae5410 100644 --- a/app/admin/forms.py +++ b/app/admin/forms.py @@ -56,6 +56,13 @@ class PreLoadCommunitiesForm(FlaskForm): communities_num = IntegerField(_l('Number of Communities to add'), default=25) pre_load_submit = SubmitField(_l('Add Communities')) +class RemoteInstanceScanForm(FlaskForm): + remote_url = StringField(_l('Remote Server'), validators=[DataRequired()]) + communities_num = IntegerField(_l('Number of Communities to add'), default=25) + minimum_posts = IntegerField(_l('Communities must have at least this many posts'), default=100) + minimum_active_users = IntegerField(_l('Communities must have at least this many active users in the past week.'), default=100) + dry_run = BooleanField(_l('Dry Run')) + remote_scan_submit = SubmitField(_l('Scan')) class ImportExportBannedListsForm(FlaskForm): import_file = FileField(_l('Import Bans List Json File')) diff --git a/app/admin/routes.py b/app/admin/routes.py index 36497d22..40b9060f 100644 --- a/app/admin/routes.py +++ b/app/admin/routes.py @@ -17,7 +17,7 @@ from app.activitypub.signature import post_request, default_context from app.activitypub.util import instance_allowed, instance_blocked, extract_domain_and_actor from app.admin.forms import FederationForm, SiteMiscForm, SiteProfileForm, EditCommunityForm, EditUserForm, \ EditTopicForm, SendNewsletterForm, AddUserForm, PreLoadCommunitiesForm, ImportExportBannedListsForm, \ - EditInstanceForm + EditInstanceForm, RemoteInstanceScanForm from app.admin.util import unsubscribe_from_everything_then_delete, 
unsubscribe_from_community, send_newsletter, \ topics_for_form from app.community.util import save_icon_file, save_banner_file, search_for_community @@ -196,6 +196,7 @@ def admin_federation(): form = FederationForm() preload_form = PreLoadCommunitiesForm() ban_lists_form = ImportExportBannedListsForm() + remote_scan_form = RemoteInstanceScanForm() # this is the pre-load communities button if preload_form.pre_load_submit.data and preload_form.validate(): @@ -315,6 +316,165 @@ def admin_federation(): return redirect(url_for('admin.admin_federation')) + # this is the remote server scan + elif remote_scan_form.remote_scan_submit.data and remote_scan_form.validate(): + + # get the remote_url data + # TODO - validate that it is an https://fqdn + remote_url = remote_scan_form.remote_url.data + + # get dry run + dry_run = remote_scan_form.dry_run.data + + # get the number of follows requested + communities_num = remote_scan_form.communities_num.data + + # get the minimums + min_posts = remote_scan_form.minimum_posts.data + min_users = remote_scan_form.minimum_active_users.data + + # get the nodeinfo + resp = get_request(f'{remote_url}/.well-known/nodeinfo') + nodeinfo_dict = json.loads(resp.text) + + # check the ['links'] for instanceinfo url + schema2p0 = "http://nodeinfo.diaspora.software/ns/schema/2.0" + schema2p1 = "http://nodeinfo.diaspora.software/ns/schema/2.1" + for e in nodeinfo_dict['links']: + if e['rel'] == schema2p0 or e['rel'] == schema2p1: + remote_instanceinfo_url = e["href"] + + # get the instanceinfo + resp = get_request(remote_instanceinfo_url) + instanceinfo_dict = json.loads(resp.text) + + # determine the instance software + instance_software_name = instanceinfo_dict['software']['name'] + # instance_software_version = instanceinfo_dict['software']['version'] + + # if the instance is not running lemmy break for now as + # we dont yet support others for scanning + # TODO - add mbin support :-) + if instance_software_name != "lemmy": + 
flash(_(f"{remote_url} does not appear to be a lemmy instance.")) + return redirect(url_for('admin.admin_federation')) + + # get the siteinfo + resp = get_request(f'{remote_url}/api/v3/site') + siteinfo_dict = json.loads(resp.text) + + # get the num of communities + community_count = siteinfo_dict["site_view"]["counts"]["communities"] + + # lemmy has a hard-coded upper limit of 50 commnities + # in their api response + # do math to figure out how many requests to send to get all the communities + # if com count remainder limit == 0 it's an even division + # if not then divide and add one + if community_count % 50 == 0: + num_requests = community_count / 50 + else: + num_requests = community_count // 50 + num_requests += 1 + + # loop through and send the right number of requests to the remote endpoint + local_on_remote_instance = [] + comms_list = [] + for i in range(1,num_requests): + params = {"sort":"New","type_":"All","limit":"50","page":f"{i}"} + resp = get_request(f"{remote_url}/api/v3/community/list", params=params) + page_dict = json.loads(resp.text) + # get the individual communities out of the communities[] list in the response and + # add them to a holding list[] of our own + for c in page_dict["communities"]: + comms_list.append(c) + + # find all the communities that are local to the remote server + # being scanned + for c in comms_list: + if c["community"]["local"]: + local_on_remote_instance.append(c) + + # filter out the communities + already_known = list(db.session.execute(text('SELECT ap_public_url FROM "community"')).scalars()) + banned_urls = list(db.session.execute(text('SELECT domain FROM "banned_instances"')).scalars()) + seven_things_plus = [ + 'shit', 'piss', 'fuck', + 'cunt', 'cocksucker', 'motherfucker', 'tits', + 'memes', 'piracy', '196', 'greentext', 'usauthoritarianism', + 'enoughmuskspam', 'political_weirdos', '4chan' + ] + candidate_communities = [] + for community in local_on_remote_instance: + # get the relevant url bits + server, 
actor_id = extract_domain_and_actor(community["community"]["actor_id"]) + + # sort out already known communities + if community['community']['actor_id'] in already_known: + continue + # sort out the nsfw communities + elif community['community']['nsfw']: + continue + # sort out any that have less than minimum posts + elif community['counts']['posts'] < min_posts: + continue + # sort out any that do not have greater than the requested active users over the past week + elif community['counts']['users_active_week'] < min_users: + continue + # sort out any instances we have already banned + elif server in banned_urls: + continue + # sort out the 'seven things you can't say on tv' names (cursewords), plus some + # "low effort" communities + if any(badword in community['community']['name'].lower() for badword in seven_things_plus): + continue + else: + candidate_communities.append(community) + + # get the community urls to join + community_urls_to_join = [] + + # if the admin user wants more added than we have, then just add all of them + if communities_num > len(candidate_communities): + communities_num = len(candidate_communities) + + # make the list of urls + for i in range(communities_num): + community_urls_to_join.append(candidate_communities[i]['community']['actor_id'].lower()) + + # if its a dry run, just return the thing we /would/ do + if dry_run: + # message = f"Dry-Run: Would follow {len(community_urls_to_join)} of {len(local_on_remote_instance)} local communities on {remote_url}." + message = f"Dry-Run: remoteurl: {remote_url}, remote comms: {len(local_on_remote_instance)}, candidates: {len(candidate_communities)}, to join: {len(community_urls_to_join)}." 
+ flash(_(message)) + return redirect(url_for('admin.admin_federation')) + + user = User.query.get(1) + remote_scan_messages = [] + for community in community_urls_to_join: + # get the relevant url bits + server, community = extract_domain_and_actor(community) + + # find the community + new_community = search_for_community('!' + community + '@' + server) + # subscribe to the community + # capture the messages returned by do_subscibe + # and show to user if instance is in debug mode + if current_app.debug: + message = do_subscribe(new_community.ap_id, user.id, admin_preload=True) + remote_scan_messages.append(message) + else: + message_we_wont_do_anything_with = do_subscribe.delay(new_community.ap_id, user.id, admin_preload=True) + + if current_app.debug: + flash(_('Results: %(results)s', results=str(remote_scan_messages))) + else: + flash( + _('Subscription process for %(communities_num)d of %(candidate_communities)d communities launched in background, check admin/activities for details', + communities_num=communities_num, candidate_communities=len(candidate_communities))) + + return redirect(url_for('admin.admin_federation')) + # this is the import bans button elif ban_lists_form.import_submit.data and ban_lists_form.validate(): import_file = request.files['import_file'] @@ -440,7 +600,7 @@ def admin_federation(): return render_template('admin/federation.html', title=_('Federation settings'), form=form, preload_form=preload_form, ban_lists_form=ban_lists_form, - current_app_debug=current_app.debug, + remote_scan_form=remote_scan_form, current_app_debug=current_app.debug, moderating_communities=moderating_communities(current_user.get_id()), joined_communities=joined_communities(current_user.get_id()), menu_topics=menu_topics(), diff --git a/app/admin/util.py b/app/admin/util.py index c3c068ad..55e8b2b1 100644 --- a/app/admin/util.py +++ b/app/admin/util.py @@ -8,8 +8,10 @@ from flask_babel import _ from app import db, cache, celery from app.activitypub.signature 
import post_request, default_context +from app.activitypub.util import extract_domain_and_actor + from app.models import User, Community, Instance, Site, ActivityPubLog, CommunityMember, Language -from app.utils import gibberish, topic_tree +from app.utils import gibberish, topic_tree, get_request def unsubscribe_from_everything_then_delete(user_id): @@ -124,5 +126,3 @@ def topics_for_form_children(topics, current_topic: int, depth: int) -> List[Tup result.extend(topics_for_form_children(topic['children'], current_topic, depth + 1)) return result - - diff --git a/app/templates/admin/federation.html b/app/templates/admin/federation.html index 92c4b4f7..34a7f981 100644 --- a/app/templates/admin/federation.html +++ b/app/templates/admin/federation.html @@ -17,7 +17,8 @@
-

Import / Export Bans

+

{{ _('Import / Export Bans') }}

+

Use this to import or export banned instances, domains, tags, and / or users.

JSON format:


             {
@@ -38,9 +39,23 @@
 
-

Use this to "pre-load" known threadiverse communities, as ranked by posts and activity. The list of communities pulls from the same list as LemmyVerse. NSFW communities and communities from banned instances are excluded.

+

{{ _('Remote Server Scan') }}

+

Use this to scan a remote lemmy server and "pre-load" its communities, as ranked by posts and activity. NSFW communities and communities from banned instances are excluded. Communities with fewer than 100 posts and fewer than 500 active users in the past week are excluded.

+

Input should be in the form of https://server-name.tld

{% if current_app_debug %} -

*** This instance is in development mode. Loading more than 6 communities here could cause timeouts, depending on how your networking is setup. ***

+

*** This instance is in development mode. This function could cause timeouts depending on how your networking is set up. ***

+ {% endif %} + {{ render_form(remote_scan_form) }} +
+
+
+ +
+
+

{{ _('Load From Lemmyverse Data') }}

+

Use this to "pre-load" known threadiverse communities, as ranked by posts and activity. The list of communities pulls from the same list as LemmyVerse. NSFW communities and communities from banned instances are excluded. Communities with less than 100 posts and less than 500 active users in the past week are excluded.

+ {% if current_app_debug %} +

*** This instance is in development mode. This function could cause timeouts depending on how your networking is set up. ***

{% endif %} {{ render_form(preload_form) }}
From 553d5de60b67a64cd748e1847770cb9337607236 Mon Sep 17 00:00:00 2001 From: aroberts-fox Date: Mon, 2 Dec 2024 14:38:50 -0500 Subject: [PATCH 2/9] adjusting dry run message --- app/admin/routes.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/app/admin/routes.py b/app/admin/routes.py index 40b9060f..721ed526 100644 --- a/app/admin/routes.py +++ b/app/admin/routes.py @@ -444,8 +444,7 @@ def admin_federation(): # if its a dry run, just return the thing we /would/ do if dry_run: - # message = f"Dry-Run: Would follow {len(community_urls_to_join)} of {len(local_on_remote_instance)} local communities on {remote_url}." - message = f"Dry-Run: remoteurl: {remote_url}, remote comms: {len(local_on_remote_instance)}, candidates: {len(candidate_communities)}, to join: {len(community_urls_to_join)}." + message = f"Dry-Run: Remote Server - {remote_url}, Total Communities Found: {len(local_on_remote_instance)}, Communities to join based on current filters: {len(community_urls_to_join)}." 
flash(_(message)) return redirect(url_for('admin.admin_federation')) From 4138a8b41b272e0f714ec4232fabd6b6007189c4 Mon Sep 17 00:00:00 2001 From: aroberts-fox Date: Mon, 2 Dec 2024 17:25:16 -0500 Subject: [PATCH 3/9] adding a few more tests for remote scan --- app/admin/forms.py | 1 + app/admin/routes.py | 66 ++++++++++++++++++++++++++++++++++----------- 2 files changed, 52 insertions(+), 15 deletions(-) diff --git a/app/admin/forms.py b/app/admin/forms.py index 23ae5410..1ee1ac1d 100644 --- a/app/admin/forms.py +++ b/app/admin/forms.py @@ -61,6 +61,7 @@ class RemoteInstanceScanForm(FlaskForm): communities_num = IntegerField(_l('Number of Communities to add'), default=25) minimum_posts = IntegerField(_l('Communities must have at least this many posts'), default=100) minimum_active_users = IntegerField(_l('Communities must have at least this many active users in the past week.'), default=100) + # allow_nsfw = BooleanField(_l('Allow NFSW'), default=False) dry_run = BooleanField(_l('Dry Run')) remote_scan_submit = SubmitField(_l('Scan')) diff --git a/app/admin/routes.py b/app/admin/routes.py index 721ed526..deb4a793 100644 --- a/app/admin/routes.py +++ b/app/admin/routes.py @@ -1,4 +1,5 @@ import os +import re from datetime import timedelta from time import sleep from io import BytesIO @@ -10,6 +11,7 @@ from flask_babel import _ from slugify import slugify from sqlalchemy import text, desc, or_ from PIL import Image +from urllib.parse import urlparse from app import db, celery, cache from app.activitypub.routes import process_inbox_request, process_delete_request, replay_inbox_request @@ -318,11 +320,35 @@ def admin_federation(): # this is the remote server scan elif remote_scan_form.remote_scan_submit.data and remote_scan_form.validate(): + # filters to be used later + already_known = list(db.session.execute(text('SELECT ap_public_url FROM "community"')).scalars()) + banned_urls = list(db.session.execute(text('SELECT domain FROM "banned_instances"')).scalars()) + 
seven_things_plus = [ + 'shit', 'piss', 'fuck', + 'cunt', 'cocksucker', 'motherfucker', 'tits', + 'memes', 'piracy', '196', 'greentext', 'usauthoritarianism', + 'enoughmuskspam', 'political_weirdos', '4chan' + ] # get the remote_url data - # TODO - validate that it is an https://fqdn remote_url = remote_scan_form.remote_url.data + # test to make sure its a valid fqdn + regex_pattern = '^(https:\/\/)(?=.{1,255}$)((.{1,63}\.){1,127}(?![0-9]*$)[a-z0-9-]+\.?)$' + result = re.match(regex_pattern, remote_url) + if result is None: + flash(_(f'{remote_url} does not appear to be a valid url. Make sure input is in the form https://server-name.tld without trailing slashes or paths.')) + return redirect(url_for('admin.admin_federation')) + + # check if its a banned instance + # Parse the URL + parsed_url = urlparse(remote_url) + # Extract the server domain name + server_domain = parsed_url.netloc + if server_domain in banned_urls: + flash(_(f'{remote_url} is a banned instance.')) + return redirect(url_for('admin.admin_federation')) + # get dry run dry_run = remote_scan_form.dry_run.data @@ -333,6 +359,9 @@ def admin_federation(): min_posts = remote_scan_form.minimum_posts.data min_users = remote_scan_form.minimum_active_users.data + # get nfsw + # allow_nsfw = remote_scan_form.allow_nsfw.data + # get the nodeinfo resp = get_request(f'{remote_url}/.well-known/nodeinfo') nodeinfo_dict = json.loads(resp.text) @@ -381,7 +410,7 @@ def admin_federation(): local_on_remote_instance = [] comms_list = [] for i in range(1,num_requests): - params = {"sort":"New","type_":"All","limit":"50","page":f"{i}"} + params = {"sort":"New","type_":"All","limit":"50","page":f"{i}","show_nsfw":"false"} resp = get_request(f"{remote_url}/api/v3/community/list", params=params) page_dict = json.loads(resp.text) # get the individual communities out of the communities[] list in the response and @@ -396,14 +425,7 @@ def admin_federation(): local_on_remote_instance.append(c) # filter out the communities - 
already_known = list(db.session.execute(text('SELECT ap_public_url FROM "community"')).scalars()) - banned_urls = list(db.session.execute(text('SELECT domain FROM "banned_instances"')).scalars()) - seven_things_plus = [ - 'shit', 'piss', 'fuck', - 'cunt', 'cocksucker', 'motherfucker', 'tits', - 'memes', 'piracy', '196', 'greentext', 'usauthoritarianism', - 'enoughmuskspam', 'political_weirdos', '4chan' - ] + already_known_count = nsfw_count = low_content_count = low_active_users_count = banned_count = bad_words_count = 0 candidate_communities = [] for community in local_on_remote_instance: # get the relevant url bits @@ -411,22 +433,28 @@ def admin_federation(): # sort out already known communities if community['community']['actor_id'] in already_known: + already_known_count += 1 continue # sort out the nsfw communities - elif community['community']['nsfw']: - continue + # elif community['community']['nsfw']: + # nsfw_count += 1 + # continue # sort out any that have less than minimum posts elif community['counts']['posts'] < min_posts: + low_content_count += 1 continue # sort out any that do not have greater than the requested active users over the past week elif community['counts']['users_active_week'] < min_users: + low_active_users_count += 1 continue # sort out any instances we have already banned - elif server in banned_urls: - continue + # elif server in banned_urls: + # banned_count += 1 + # continue # sort out the 'seven things you can't say on tv' names (cursewords), plus some # "low effort" communities if any(badword in community['community']['name'].lower() for badword in seven_things_plus): + bad_words_count += 1 continue else: candidate_communities.append(community) @@ -444,7 +472,15 @@ def admin_federation(): # if its a dry run, just return the thing we /would/ do if dry_run: - message = f"Dry-Run: Remote Server - {remote_url}, Total Communities Found: {len(local_on_remote_instance)}, Communities to join based on current filters: 
{len(community_urls_to_join)}." + message = f"Dry-Run for {remote_url}, \ + Total Communities the server knows about: {community_count}, \ + Local Communities on the server: {len(local_on_remote_instance)}, \ + Communities we already have: {already_known_count}, \ + Communities below minimum posts: {low_content_count}, \ + Communities below minimum users: {low_active_users_count}, \ + Candidate Communities based on filters: {len(candidate_communities)}, \ + Communities to join request: {communities_num}, \ + Communities to join based on current filters: {len(community_urls_to_join)}." flash(_(message)) return redirect(url_for('admin.admin_federation')) From 1ef0a1917c8ce048003c7f49a3fb8b207613748c Mon Sep 17 00:00:00 2001 From: aroberts-fox Date: Mon, 2 Dec 2024 18:00:22 -0500 Subject: [PATCH 4/9] comment cleanup --- app/admin/forms.py | 1 - app/admin/routes.py | 27 ++++++--------------------- 2 files changed, 6 insertions(+), 22 deletions(-) diff --git a/app/admin/forms.py b/app/admin/forms.py index 1ee1ac1d..23ae5410 100644 --- a/app/admin/forms.py +++ b/app/admin/forms.py @@ -61,7 +61,6 @@ class RemoteInstanceScanForm(FlaskForm): communities_num = IntegerField(_l('Number of Communities to add'), default=25) minimum_posts = IntegerField(_l('Communities must have at least this many posts'), default=100) minimum_active_users = IntegerField(_l('Communities must have at least this many active users in the past week.'), default=100) - # allow_nsfw = BooleanField(_l('Allow NFSW'), default=False) dry_run = BooleanField(_l('Dry Run')) remote_scan_submit = SubmitField(_l('Scan')) diff --git a/app/admin/routes.py b/app/admin/routes.py index deb4a793..34619b74 100644 --- a/app/admin/routes.py +++ b/app/admin/routes.py @@ -337,10 +337,10 @@ def admin_federation(): regex_pattern = '^(https:\/\/)(?=.{1,255}$)((.{1,63}\.){1,127}(?![0-9]*$)[a-z0-9-]+\.?)$' result = re.match(regex_pattern, remote_url) if result is None: - flash(_(f'{remote_url} does not appear to be a valid url. 
Make sure input is in the form https://server-name.tld without trailing slashes or paths.')) + flash(_(f'{remote_url} does not appear to be a valid url. Make sure input is in the form "https://server-name.tld" without trailing slashes or paths.')) return redirect(url_for('admin.admin_federation')) - # check if its a banned instance + # check if it's a banned instance # Parse the URL parsed_url = urlparse(remote_url) # Extract the server domain name @@ -359,9 +359,6 @@ def admin_federation(): min_posts = remote_scan_form.minimum_posts.data min_users = remote_scan_form.minimum_active_users.data - # get nfsw - # allow_nsfw = remote_scan_form.allow_nsfw.data - # get the nodeinfo resp = get_request(f'{remote_url}/.well-known/nodeinfo') nodeinfo_dict = json.loads(resp.text) @@ -410,7 +407,7 @@ def admin_federation(): local_on_remote_instance = [] comms_list = [] for i in range(1,num_requests): - params = {"sort":"New","type_":"All","limit":"50","page":f"{i}","show_nsfw":"false"} + params = {"sort":"Active","type_":"All","limit":"50","page":f"{i}","show_nsfw":"false"} resp = get_request(f"{remote_url}/api/v3/community/list", params=params) page_dict = json.loads(resp.text) # get the individual communities out of the communities[] list in the response and @@ -425,20 +422,13 @@ def admin_federation(): local_on_remote_instance.append(c) # filter out the communities - already_known_count = nsfw_count = low_content_count = low_active_users_count = banned_count = bad_words_count = 0 + already_known_count = nsfw_count = low_content_count = low_active_users_count = bad_words_count = 0 candidate_communities = [] for community in local_on_remote_instance: - # get the relevant url bits - server, actor_id = extract_domain_and_actor(community["community"]["actor_id"]) - # sort out already known communities if community['community']['actor_id'] in already_known: already_known_count += 1 continue - # sort out the nsfw communities - # elif community['community']['nsfw']: - # nsfw_count 
+= 1 - # continue # sort out any that have less than minimum posts elif community['counts']['posts'] < min_posts: low_content_count += 1 @@ -447,10 +437,6 @@ def admin_federation(): elif community['counts']['users_active_week'] < min_users: low_active_users_count += 1 continue - # sort out any instances we have already banned - # elif server in banned_urls: - # banned_count += 1 - # continue # sort out the 'seven things you can't say on tv' names (cursewords), plus some # "low effort" communities if any(badword in community['community']['name'].lower() for badword in seven_things_plus): @@ -470,9 +456,9 @@ def admin_federation(): for i in range(communities_num): community_urls_to_join.append(candidate_communities[i]['community']['actor_id'].lower()) - # if its a dry run, just return the thing we /would/ do + # if its a dry run, just return the stats if dry_run: - message = f"Dry-Run for {remote_url}, \ + message = f"Dry-Run for {remote_url}: \ Total Communities the server knows about: {community_count}, \ Local Communities on the server: {len(local_on_remote_instance)}, \ Communities we already have: {already_known_count}, \ @@ -489,7 +475,6 @@ def admin_federation(): for community in community_urls_to_join: # get the relevant url bits server, community = extract_domain_and_actor(community) - # find the community new_community = search_for_community('!' 
+ community + '@' + server) # subscribe to the community From d3d9c2625af6740ede95eb4189ad4fe5aded9468 Mon Sep 17 00:00:00 2001 From: aroberts-fox Date: Wed, 4 Dec 2024 16:44:19 -0500 Subject: [PATCH 5/9] adjusting the requesting loop --- app/admin/forms.py | 2 +- app/admin/routes.py | 59 ++++++++++++++++++--------------------------- 2 files changed, 24 insertions(+), 37 deletions(-) diff --git a/app/admin/forms.py b/app/admin/forms.py index 23ae5410..6d997af6 100644 --- a/app/admin/forms.py +++ b/app/admin/forms.py @@ -58,7 +58,7 @@ class PreLoadCommunitiesForm(FlaskForm): class RemoteInstanceScanForm(FlaskForm): remote_url = StringField(_l('Remote Server'), validators=[DataRequired()]) - communities_num = IntegerField(_l('Number of Communities to add'), default=25) + communities_requested = IntegerField(_l('Number of Communities to add'), default=25) minimum_posts = IntegerField(_l('Communities must have at least this many posts'), default=100) minimum_active_users = IntegerField(_l('Communities must have at least this many active users in the past week.'), default=100) dry_run = BooleanField(_l('Dry Run')) diff --git a/app/admin/routes.py b/app/admin/routes.py index 34619b74..21d8a02b 100644 --- a/app/admin/routes.py +++ b/app/admin/routes.py @@ -353,7 +353,7 @@ def admin_federation(): dry_run = remote_scan_form.dry_run.data # get the number of follows requested - communities_num = remote_scan_form.communities_num.data + communities_requested = remote_scan_form.communities_requested.data # get the minimums min_posts = remote_scan_form.minimum_posts.data @@ -385,46 +385,32 @@ def admin_federation(): flash(_(f"{remote_url} does not appear to be a lemmy instance.")) return redirect(url_for('admin.admin_federation')) - # get the siteinfo - resp = get_request(f'{remote_url}/api/v3/site') - siteinfo_dict = json.loads(resp.text) - - # get the num of communities - community_count = siteinfo_dict["site_view"]["counts"]["communities"] - # lemmy has a hard-coded upper 
limit of 50 commnities # in their api response - # do math to figure out how many requests to send to get all the communities - # if com count remainder limit == 0 it's an even division - # if not then divide and add one - if community_count % 50 == 0: - num_requests = community_count / 50 - else: - num_requests = community_count // 50 - num_requests += 1 - - # loop through and send the right number of requests to the remote endpoint - local_on_remote_instance = [] + # loop through and send off requests to the remote endpoint for 50 communities at a time comms_list = [] - for i in range(1,num_requests): - params = {"sort":"Active","type_":"All","limit":"50","page":f"{i}","show_nsfw":"false"} + page = 1 + get_more_communities = True + while get_more_communities: + params = {"sort":"Active","type_":"Local","limit":"50","page":f"{page}","show_nsfw":"false"} resp = get_request(f"{remote_url}/api/v3/community/list", params=params) page_dict = json.loads(resp.text) # get the individual communities out of the communities[] list in the response and # add them to a holding list[] of our own for c in page_dict["communities"]: comms_list.append(c) - - # find all the communities that are local to the remote server - # being scanned - for c in comms_list: - if c["community"]["local"]: - local_on_remote_instance.append(c) + # check the amount of items in the page_dict['communities'] list + # if it's lesss than 50 then we know its the last page of communities + # so we break the loop + if len(page_dict['communities']) < 50: + get_more_communities = False + else: + page += 1 # filter out the communities already_known_count = nsfw_count = low_content_count = low_active_users_count = bad_words_count = 0 candidate_communities = [] - for community in local_on_remote_instance: + for community in comms_list: # sort out already known communities if community['community']['actor_id'] in already_known: already_known_count += 1 @@ -449,23 +435,24 @@ def admin_federation(): 
community_urls_to_join = [] # if the admin user wants more added than we have, then just add all of them - if communities_num > len(candidate_communities): - communities_num = len(candidate_communities) + if communities_requested > len(candidate_communities): + communities_to_add = len(candidate_communities) + else: + communities_to_add = communities_requested # make the list of urls - for i in range(communities_num): + for i in range(communities_to_add): community_urls_to_join.append(candidate_communities[i]['community']['actor_id'].lower()) # if its a dry run, just return the stats if dry_run: message = f"Dry-Run for {remote_url}: \ - Total Communities the server knows about: {community_count}, \ - Local Communities on the server: {len(local_on_remote_instance)}, \ + Local Communities on the server: {len(comms_list)}, \ Communities we already have: {already_known_count}, \ Communities below minimum posts: {low_content_count}, \ Communities below minimum users: {low_active_users_count}, \ Candidate Communities based on filters: {len(candidate_communities)}, \ - Communities to join request: {communities_num}, \ + Communities to join request: {communities_requested}, \ Communities to join based on current filters: {len(community_urls_to_join)}." 
flash(_(message)) return redirect(url_for('admin.admin_federation')) @@ -490,8 +477,8 @@ def admin_federation(): flash(_('Results: %(results)s', results=str(remote_scan_messages))) else: flash( - _('Subscription process for %(communities_num)d of %(candidate_communities)d communities launched in background, check admin/activities for details', - communities_num=communities_num, candidate_communities=len(candidate_communities))) + _('Subscription process for %(communities_requested)d of %(candidate_communities)d communities launched in background, check admin/activities for details', + communities_requested=communities_requested, candidate_communities=len(candidate_communities))) return redirect(url_for('admin.admin_federation')) From cf709ec6da1d985c3f3b6e80be30f73c02719ef6 Mon Sep 17 00:00:00 2001 From: aroberts-fox Date: Wed, 4 Dec 2024 17:17:18 -0500 Subject: [PATCH 6/9] adjusting join message --- app/admin/routes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/app/admin/routes.py b/app/admin/routes.py index 21d8a02b..05810300 100644 --- a/app/admin/routes.py +++ b/app/admin/routes.py @@ -477,8 +477,8 @@ def admin_federation(): flash(_('Results: %(results)s', results=str(remote_scan_messages))) else: flash( - _('Subscription process for %(communities_requested)d of %(candidate_communities)d communities launched in background, check admin/activities for details', - communities_requested=communities_requested, candidate_communities=len(candidate_communities))) + _('Based on current filters, the subscription process for %(communities_to_join)d of %(candidate_communities)d communities launched in background, check admin/activities for details', + communities_to_join=len(community_urls_to_join), candidate_communities=len(candidate_communities))) return redirect(url_for('admin.admin_federation')) From 422d494998b32cf60a336a77ad1f8bcd515bc3d0 Mon Sep 17 00:00:00 2001 From: aroberts-fox Date: Wed, 4 Dec 2024 18:36:36 -0500 Subject: [PATCH 7/9] 
Adding mbin support to the remote server scan function --- app/admin/routes.py | 245 ++++++++++++++++++++++++++++++-------------- 1 file changed, 170 insertions(+), 75 deletions(-) diff --git a/app/admin/routes.py b/app/admin/routes.py index 05810300..9046375b 100644 --- a/app/admin/routes.py +++ b/app/admin/routes.py @@ -329,6 +329,9 @@ def admin_federation(): 'memes', 'piracy', '196', 'greentext', 'usauthoritarianism', 'enoughmuskspam', 'political_weirdos', '4chan' ] + is_lemmy = False + is_mbin = False + # get the remote_url data remote_url = remote_scan_form.remote_url.data @@ -378,85 +381,177 @@ def admin_federation(): instance_software_name = instanceinfo_dict['software']['name'] # instance_software_version = instanceinfo_dict['software']['version'] - # if the instance is not running lemmy break for now as + # if the instance is not running lemmy or mbin break for now as # we dont yet support others for scanning - # TODO - add mbin support :-) - if instance_software_name != "lemmy": - flash(_(f"{remote_url} does not appear to be a lemmy instance.")) - return redirect(url_for('admin.admin_federation')) - - # lemmy has a hard-coded upper limit of 50 commnities - # in their api response - # loop through and send off requests to the remote endpoint for 50 communities at a time - comms_list = [] - page = 1 - get_more_communities = True - while get_more_communities: - params = {"sort":"Active","type_":"Local","limit":"50","page":f"{page}","show_nsfw":"false"} - resp = get_request(f"{remote_url}/api/v3/community/list", params=params) - page_dict = json.loads(resp.text) - # get the individual communities out of the communities[] list in the response and - # add them to a holding list[] of our own - for c in page_dict["communities"]: - comms_list.append(c) - # check the amount of items in the page_dict['communities'] list - # if it's lesss than 50 then we know its the last page of communities - # so we break the loop - if len(page_dict['communities']) < 50: - 
get_more_communities = False - else: - page += 1 - - # filter out the communities - already_known_count = nsfw_count = low_content_count = low_active_users_count = bad_words_count = 0 - candidate_communities = [] - for community in comms_list: - # sort out already known communities - if community['community']['actor_id'] in already_known: - already_known_count += 1 - continue - # sort out any that have less than minimum posts - elif community['counts']['posts'] < min_posts: - low_content_count += 1 - continue - # sort out any that do not have greater than the requested active users over the past week - elif community['counts']['users_active_week'] < min_users: - low_active_users_count += 1 - continue - # sort out the 'seven things you can't say on tv' names (cursewords), plus some - # "low effort" communities - if any(badword in community['community']['name'].lower() for badword in seven_things_plus): - bad_words_count += 1 - continue - else: - candidate_communities.append(community) - - # get the community urls to join - community_urls_to_join = [] - - # if the admin user wants more added than we have, then just add all of them - if communities_requested > len(candidate_communities): - communities_to_add = len(candidate_communities) + if instance_software_name == "lemmy": + is_lemmy = True + elif instance_software_name == "mbin": + is_mbin = True else: - communities_to_add = communities_requested - - # make the list of urls - for i in range(communities_to_add): - community_urls_to_join.append(candidate_communities[i]['community']['actor_id'].lower()) - - # if its a dry run, just return the stats - if dry_run: - message = f"Dry-Run for {remote_url}: \ - Local Communities on the server: {len(comms_list)}, \ - Communities we already have: {already_known_count}, \ - Communities below minimum posts: {low_content_count}, \ - Communities below minimum users: {low_active_users_count}, \ - Candidate Communities based on filters: {len(candidate_communities)}, \ - 
Communities to join request: {communities_requested}, \ - Communities to join based on current filters: {len(community_urls_to_join)}." - flash(_(message)) + flash(_(f"{remote_url} does not appear to be a lemmy or mbin instance.")) return redirect(url_for('admin.admin_federation')) + if is_lemmy: + # lemmy has a hard-coded upper limit of 50 commnities + # in their api response + # loop through and send off requests to the remote endpoint for 50 communities at a time + comms_list = [] + page = 1 + get_more_communities = True + while get_more_communities: + params = {"sort":"Active","type_":"Local","limit":"50","page":f"{page}","show_nsfw":"false"} + resp = get_request(f"{remote_url}/api/v3/community/list", params=params) + page_dict = json.loads(resp.text) + # get the individual communities out of the communities[] list in the response and + # add them to a holding list[] of our own + for c in page_dict["communities"]: + comms_list.append(c) + # check the amount of items in the page_dict['communities'] list + # if it's lesss than 50 then we know its the last page of communities + # so we break the loop + if len(page_dict['communities']) < 50: + get_more_communities = False + else: + page += 1 + + # filter out the communities + already_known_count = nsfw_count = low_content_count = low_active_users_count = bad_words_count = 0 + candidate_communities = [] + for community in comms_list: + # sort out already known communities + if community['community']['actor_id'] in already_known: + already_known_count += 1 + continue + # sort out any that have less than minimum posts + elif community['counts']['posts'] < min_posts: + low_content_count += 1 + continue + # sort out any that do not have greater than the requested active users over the past week + elif community['counts']['users_active_week'] < min_users: + low_active_users_count += 1 + continue + # sort out the 'seven things you can't say on tv' names (cursewords), plus some + # "low effort" communities + if any(badword 
in community['community']['name'].lower() for badword in seven_things_plus): + bad_words_count += 1 + continue + else: + candidate_communities.append(community) + + # get the community urls to join + community_urls_to_join = [] + + # if the admin user wants more added than we have, then just add all of them + if communities_requested > len(candidate_communities): + communities_to_add = len(candidate_communities) + else: + communities_to_add = communities_requested + + # make the list of urls + for i in range(communities_to_add): + community_urls_to_join.append(candidate_communities[i]['community']['actor_id'].lower()) + + # if its a dry run, just return the stats + if dry_run: + message = f"Dry-Run for {remote_url}: \ + Local Communities on the server: {len(comms_list)}, \ + Communities we already have: {already_known_count}, \ + Communities below minimum posts: {low_content_count}, \ + Communities below minimum users: {low_active_users_count}, \ + Candidate Communities based on filters: {len(candidate_communities)}, \ + Communities to join request: {communities_requested}, \ + Communities to join based on current filters: {len(community_urls_to_join)}." 
+ flash(_(message)) + return redirect(url_for('admin.admin_federation')) + + if is_mbin: + # hit the /api/magazines with a single call to get the stats for counts and num_requests + # mbin does not have the hard-coded limit, but lets stick with 50 to match lemmy + params = {"p":"1","perPage":"50","sort":"active","federation":"local","hide_adult":"hide"} + resp = get_request(f"{remote_url}/api/magazines", params=params) + page_dict = json.loads(resp.text) + + # get the number of requests to send + # num_requests = page_dict['pagination']['maxPage'] + + # loop through and send the right number of requests to the remote endpoint + # local_on_remote_instance = [] + mags_list = [] + page = 1 + get_more_magazines = True + while get_more_magazines: + params = {"p":f"{page}","perPage":"50","sort":"active","federation":"local","hide_adult":"hide"} + resp = get_request(f"{remote_url}/api/magazines", params=params) + page_dict = json.loads(resp.text) + # get the individual magazines out of the items[] list in the response and + # add them to a holding list[] of our own + for m in page_dict['items']: + mags_list.append(m) + # check the amount of items in the page_dict['items'] list + # if it's lesss than 50 then we know its the last page of magazines + # so we break the loop + if len(page_dict['items']) < 50: + get_more_magazines = False + else: + page += 1 + + + # filter out the magazines + already_known_count = low_content_count = low_subscribed_users_count = bad_words_count = 0 + candidate_magazines = [] + for magazine in mags_list: + # sort out already known communities + if magazine['apProfileId'] in already_known: + already_known_count += 1 + continue + # sort out any that have less than minimum posts + elif magazine['entryCount'] < min_posts: + low_content_count += 1 + continue + # sort out any that do not have greater than the requested users over the past week + # mbin does not show active users here, so its based on subscriber count + elif 
magazine['subscriptionsCount'] < min_users: + low_subscribed_users_count += 1 + continue + # sort out the 'seven things you can't say on tv' names (cursewords), plus some + # "low effort" communities + if any(badword in magazine['name'].lower() for badword in seven_things_plus): + bad_words_count += 1 + continue + else: + candidate_magazines.append(magazine) + + # testing + # flash(_(f"testing: candidate mags {len(candidate_magazines)}, m0: {candidate_magazines[0]}")) + # return redirect(url_for('admin.admin_federation')) + + # get the community urls to join + community_urls_to_join = [] + + # if the admin user wants more added than we have, then just add all of them + if communities_requested > len(candidate_magazines): + magazines_to_add = len(candidate_magazines) + else: + magazines_to_add = communities_requested + + # make the list of urls + for i in range(magazines_to_add): + community_urls_to_join.append(candidate_magazines[i]['apProfileId'].lower()) + + # if its a dry run, just return the stats + if dry_run: + message = f"Dry-Run for {remote_url}: \ + Local Magazines on the server: {len(mags_list)}, \ + Magazines we already have: {already_known_count}, \ + Magazines below minimum posts: {low_content_count}, \ + Magazines below minimum users: {low_subscribed_users_count}, \ + Candidate Magazines based on filters: {len(candidate_magazines)}, \ + Magazines to join request: {communities_requested}, \ + Magazines to join based on current filters: {len(community_urls_to_join)}." 
+ flash(_(message)) + return redirect(url_for('admin.admin_federation')) + + user = User.query.get(1) remote_scan_messages = [] for community in community_urls_to_join: From 27915b654dfad06476c94e7999463873bc4fb28e Mon Sep 17 00:00:00 2001 From: aroberts-fox Date: Wed, 4 Dec 2024 18:54:22 -0500 Subject: [PATCH 8/9] adjusting var names --- app/admin/routes.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/app/admin/routes.py b/app/admin/routes.py index 9046375b..48b170c4 100644 --- a/app/admin/routes.py +++ b/app/admin/routes.py @@ -467,9 +467,9 @@ def admin_federation(): if is_mbin: # hit the /api/magazines with a single call to get the stats for counts and num_requests # mbin does not have the hard-coded limit, but lets stick with 50 to match lemmy - params = {"p":"1","perPage":"50","sort":"active","federation":"local","hide_adult":"hide"} - resp = get_request(f"{remote_url}/api/magazines", params=params) - page_dict = json.loads(resp.text) + # params = {"p":"1","perPage":"50","sort":"active","federation":"local","hide_adult":"hide"} + # resp = get_request(f"{remote_url}/api/magazines", params=params) + # page_dict = json.loads(resp.text) # get the number of requests to send # num_requests = page_dict['pagination']['maxPage'] @@ -498,7 +498,7 @@ def admin_federation(): # filter out the magazines already_known_count = low_content_count = low_subscribed_users_count = bad_words_count = 0 - candidate_magazines = [] + candidate_communities = [] for magazine in mags_list: # sort out already known communities if magazine['apProfileId'] in already_known: @@ -519,7 +519,7 @@ def admin_federation(): bad_words_count += 1 continue else: - candidate_magazines.append(magazine) + candidate_communities.append(magazine) # testing # flash(_(f"testing: candidate mags {len(candidate_magazines)}, m0: {candidate_magazines[0]}")) @@ -529,14 +529,14 @@ def admin_federation(): community_urls_to_join = [] # if the admin user wants more added than 
we have, then just add all of them - if communities_requested > len(candidate_magazines): - magazines_to_add = len(candidate_magazines) + if communities_requested > len(candidate_communities): + magazines_to_add = len(candidate_communities) else: magazines_to_add = communities_requested # make the list of urls for i in range(magazines_to_add): - community_urls_to_join.append(candidate_magazines[i]['apProfileId'].lower()) + community_urls_to_join.append(candidate_communities[i]['apProfileId'].lower()) # if its a dry run, just return the stats if dry_run: @@ -545,7 +545,7 @@ def admin_federation(): Magazines we already have: {already_known_count}, \ Magazines below minimum posts: {low_content_count}, \ Magazines below minimum users: {low_subscribed_users_count}, \ - Candidate Magazines based on filters: {len(candidate_magazines)}, \ + Candidate Magazines based on filters: {len(candidate_communities)}, \ Magazines to join request: {communities_requested}, \ Magazines to join based on current filters: {len(community_urls_to_join)}." flash(_(message)) @@ -560,7 +560,7 @@ def admin_federation(): # find the community new_community = search_for_community('!' 
+ community + '@' + server) # subscribe to the community - # capture the messages returned by do_subscibe + # capture the messages returned by do_subscribe # and show to user if instance is in debug mode if current_app.debug: message = do_subscribe(new_community.ap_id, user.id, admin_preload=True) From fb8dc9e8cb7850ca31c674a5de813f71130529d1 Mon Sep 17 00:00:00 2001 From: aroberts-fox Date: Wed, 4 Dec 2024 19:01:08 -0500 Subject: [PATCH 9/9] comment cleanup --- app/admin/routes.py | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/app/admin/routes.py b/app/admin/routes.py index 48b170c4..8e9d858c 100644 --- a/app/admin/routes.py +++ b/app/admin/routes.py @@ -465,17 +465,8 @@ def admin_federation(): return redirect(url_for('admin.admin_federation')) if is_mbin: - # hit the /api/magazines with a single call to get the stats for counts and num_requests + # loop through and send the right number of requests to the remote endpoint for mbin # mbin does not have the hard-coded limit, but lets stick with 50 to match lemmy - # params = {"p":"1","perPage":"50","sort":"active","federation":"local","hide_adult":"hide"} - # resp = get_request(f"{remote_url}/api/magazines", params=params) - # page_dict = json.loads(resp.text) - - # get the number of requests to send - # num_requests = page_dict['pagination']['maxPage'] - - # loop through and send the right number of requests to the remote endpoint - # local_on_remote_instance = [] mags_list = [] page = 1 get_more_magazines = True @@ -494,7 +485,6 @@ def admin_federation(): get_more_magazines = False else: page += 1 - # filter out the magazines already_known_count = low_content_count = low_subscribed_users_count = bad_words_count = 0 @@ -521,10 +511,6 @@ def admin_federation(): else: candidate_communities.append(magazine) - # testing - # flash(_(f"testing: candidate mags {len(candidate_magazines)}, m0: {candidate_magazines[0]}")) - # return redirect(url_for('admin.admin_federation')) - # get the 
community urls to join community_urls_to_join = [] @@ -551,7 +537,6 @@ def admin_federation(): flash(_(message)) return redirect(url_for('admin.admin_federation')) - user = User.query.get(1) remote_scan_messages = [] for community in community_urls_to_join: