From bb9059bf704c28e8705c711f82ae3f32e8db396f Mon Sep 17 00:00:00 2001 From: freamon Date: Sat, 21 Sep 2024 09:37:54 +0000 Subject: [PATCH 1/4] Remove code that added comment from autotl;dr bot to post body (bot was disabled 23/07) --- app/activitypub/util.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/app/activitypub/util.py b/app/activitypub/util.py index 45f357a9..5ff2f3e2 100644 --- a/app/activitypub/util.py +++ b/app/activitypub/util.py @@ -1579,17 +1579,6 @@ def create_post_reply(activity_log: ActivityPubLog, community: Community, in_rep return None post = Post.query.get(post_id) - # special case: add comment from auto-tldr bot to post body if body is empty - if user.ap_id == 'autotldr@lemmings.world': - if not post.body or (post.body and post.body.strip() == ''): - if not '::: spoiler' in post_reply.body: - post.body = "🤖 I'm a bot that provides automatic summaries for articles:\n::: spoiler Click here to see the summary\n" + post_reply.body + '\n:::' - else: - post.body = post_reply.body - post.body_html = lemmy_markdown_to_html(post.body) + '\n\nGenerated using AI by: AutoTL;DR' - db.session.commit() - return None - if post.comments_enabled: anchor = None if not parent_comment_id: From 5e2ec8f9d9b10aae164b7212438c62f79ef1e6f6 Mon Sep 17 00:00:00 2001 From: freamon Date: Sat, 21 Sep 2024 20:05:34 +0000 Subject: [PATCH 2/4] Remove last remaining uses of lemmy_markdown_to_html Everything is now piped through allowlist (except spoiler MD as Lemmy doesn't convert the MD in the contents) --- app/activitypub/routes.py | 4 +- app/activitypub/util.py | 127 +++++++++++++++++++++++++------------- app/utils.py | 18 ------ 3 files changed, 85 insertions(+), 64 deletions(-) diff --git a/app/activitypub/routes.py b/app/activitypub/routes.py index dd1d3457..fe8c1855 100644 --- a/app/activitypub/routes.py +++ b/app/activitypub/routes.py @@ -29,7 +29,7 @@ from app.activitypub.util import public_key, users_total, active_half_year, acti from app.utils import 
gibberish, get_setting, render_template, \ community_membership, ap_datetime, ip_address, can_downvote, \ can_upvote, can_create_post, awaken_dormant_instance, shorten_string, can_create_post_reply, sha256_digest, \ - community_moderators, lemmy_markdown_to_html + community_moderators, markdown_to_html @bp.route('/testredis') @@ -508,7 +508,7 @@ def process_inbox_request(request_json, activitypublog_id, ip_address): encrypted = request_json['object']['encrypted'] if 'encrypted' in request_json['object'] else None new_message = ChatMessage(sender_id=sender.id, recipient_id=recipient.id, conversation_id=existing_conversation.id, body=request_json['object']['source']['content'], - body_html=lemmy_markdown_to_html(request_json['object']['source']['content']), + body_html=markdown_to_html(request_json['object']['source']['content']), encrypted=encrypted) db.session.add(new_message) existing_conversation.updated_at = utcnow() diff --git a/app/activitypub/util.py b/app/activitypub/util.py index 5ff2f3e2..4cf39aa4 100644 --- a/app/activitypub/util.py +++ b/app/activitypub/util.py @@ -28,7 +28,7 @@ from app.utils import get_request, allowlist_html, get_setting, ap_datetime, mar is_image_url, domain_from_url, gibberish, ensure_directory_exists, markdown_to_text, head_request, post_ranking, \ shorten_string, reply_already_exists, reply_is_just_link_to_gif_reaction, confidence, remove_tracking_from_link, \ blocked_phrases, microblog_content_to_title, generate_image_from_video_url, is_video_url, reply_is_stupid, \ - notification_subscribers, communities_banned_from, lemmy_markdown_to_html, actor_contains_blocked_words, \ + notification_subscribers, communities_banned_from, actor_contains_blocked_words, \ html_to_text, opengraph_parse, url_to_thumbnail_file, add_to_modlog_activitypub, joined_communities, \ moderating_communities, is_video_hosting_site @@ -476,7 +476,19 @@ def refresh_user_profile_task(user_id): user.user_name = activity_json['preferredUsername'] if 'name' in 
activity_json: user.title = activity_json['name'] - user.about_html = parse_summary(activity_json) + if 'summary' in activity_json: + about_html = activity_json['summary'] + if not about_html.startswith('<'): # PeerTube + about_html = '

<p>' + about_html + '</p>

' + user.about_html = allowlist_html(about_html) + else: + user.about_html = '' + if 'source' in activity_json and activity_json['source'].get('mediaType') == 'text/markdown': + user.about = activity_json['source']['content'] + if '::: spoiler' in user.about: + user.about_html = markdown_to_html(user.about) # overwrite as Lemmy doesn't convert spoiler contents into HTML very well + else: + user.about = html_to_text(user.about_html) user.ap_fetched_at = utcnow() user.public_key = activity_json['publicKey']['publicKeyPem'] user.indexable = new_indexable @@ -550,10 +562,6 @@ def refresh_community_profile_task(community_id): if 'nsfl' in activity_json and activity_json['nsfl']: community.nsfl = activity_json['nsfl'] community.title = activity_json['name'] - community.description = activity_json['summary'] if 'summary' in activity_json else '' - community.description_html = markdown_to_html(community.description) - community.rules = activity_json['rules'] if 'rules' in activity_json else '' - community.rules_html = lemmy_markdown_to_html(activity_json['rules'] if 'rules' in activity_json else '') community.restricted_to_mods = activity_json['postingRestrictedToMods'] if 'postingRestrictedToMods' in activity_json else False community.new_mods_wanted = activity_json['newModsWanted'] if 'newModsWanted' in activity_json else False community.private_mods = activity_json['privateMods'] if 'privateMods' in activity_json else False @@ -561,16 +569,28 @@ def refresh_community_profile_task(community_id): community.ap_fetched_at = utcnow() community.public_key=activity_json['publicKey']['publicKeyPem'] - if 'source' in activity_json and \ - activity_json['source']['mediaType'] == 'text/markdown': - community.description = activity_json['source']['content'] - community.description_html = lemmy_markdown_to_html(community.description) + description_html = '' + if 'summary' in activity_json: + description_html = activity_json['summary'] elif 'content' in activity_json: - 
community.description_html = allowlist_html(activity_json['content']) - community.description = '' + description_html = activity_json['content'] + else: + description_html = '' - if community.description and community.description.startswith('

'): - community.description_html = allowlist_html(community.description) + if description_html != '': + if not description_html.startswith('<'): # PeerTube + description_html = '

<p>' + description_html + '</p>

' + community.description_html = allowlist_html(description_html) + if 'source' in activity_json and activity_json['source'].get('mediaType') == 'text/markdown': + community.description = activity_json['source']['content'] + if '::: spoiler' in community.description: + community.description_html = markdown_to_html(community.description) # overwrite as Lemmy doesn't convert spoiler contents into HTML very well + else: + community.description = html_to_text(community.description_html) + + if 'rules' in activity_json: + community.rules_html = allowlist_html(activity_json['rules']) + community.rules = html_to_text(community.rules_html) icon_changed = cover_changed = False if 'icon' in activity_json: @@ -659,7 +679,6 @@ def actor_json_to_model(activity_json, address, server): user = User(user_name=activity_json['preferredUsername'], title=activity_json['name'] if 'name' in activity_json else None, email=f"{address}@{server}", - about_html=parse_summary(activity_json), matrix_user_id=activity_json['matrixUserId'] if 'matrixUserId' in activity_json else '', indexable=activity_json['indexable'] if 'indexable' in activity_json else True, searchable=activity_json['discoverable'] if 'discoverable' in activity_json else True, @@ -682,6 +701,20 @@ def actor_json_to_model(activity_json, address, server): current_app.logger.error(f'KeyError for {address}@{server} while parsing ' + str(activity_json)) return None + if 'summary' in activity_json: + about_html = activity_json['summary'] + if not about_html.startswith('<'): # PeerTube + about_html = '

<p>' + about_html + '</p>

' + user.about_html = allowlist_html(about_html) + else: + user.about_html = '' + if 'source' in activity_json and activity_json['source'].get('mediaType') == 'text/markdown': + user.about = activity_json['source']['content'] + if '::: spoiler' in user.about: + user.about_html = markdown_to_html(user.about) # overwrite as Lemmy doesn't convert spoiler contents into HTML very well + else: + user.about = html_to_text(user.about_html) + if 'icon' in activity_json and activity_json['icon'] is not None: if isinstance(activity_json['icon'], dict) and 'url' in activity_json['icon']: icon_entry = activity_json['icon']['url'] @@ -723,9 +756,6 @@ def actor_json_to_model(activity_json, address, server): community = Community(name=activity_json['preferredUsername'], title=activity_json['name'], - description=activity_json['summary'] if 'summary' in activity_json else '', - rules=activity_json['rules'] if 'rules' in activity_json else '', - rules_html=lemmy_markdown_to_html(activity_json['rules'] if 'rules' in activity_json else ''), nsfw=activity_json['sensitive'] if 'sensitive' in activity_json else False, restricted_to_mods=activity_json['postingRestrictedToMods'] if 'postingRestrictedToMods' in activity_json else False, new_mods_wanted=activity_json['newModsWanted'] if 'newModsWanted' in activity_json else False, @@ -747,18 +777,30 @@ def actor_json_to_model(activity_json, address, server): instance_id=find_instance_id(server), low_quality='memes' in activity_json['preferredUsername'] ) - if community.description.startswith('

'): - community.description_html = allowlist_html(community.description) - else: - community.description_html = markdown_to_html(community.description) - # parse markdown and overwrite html field with result - if 'source' in activity_json and \ - activity_json['source']['mediaType'] == 'text/markdown': - community.description = activity_json['source']['content'] - community.description_html = lemmy_markdown_to_html(community.description) + + description_html = '' + if 'summary' in activity_json: + description_html = activity_json['summary'] elif 'content' in activity_json: - community.description_html = allowlist_html(activity_json['content']) - community.description = '' + description_html = activity_json['content'] + else: + description_html = '' + + if description_html != '': + if not description_html.startswith('<'): # PeerTube + description_html = '

<p>' + description_html + '</p>

' + community.description_html = allowlist_html(description_html) + if 'source' in activity_json and activity_json['source'].get('mediaType') == 'text/markdown': + community.description = activity_json['source']['content'] + if '::: spoiler' in community.description: + community.description_html = markdown_to_html(community.description) # overwrite as Lemmy doesn't convert spoiler contents into HTML very well + else: + community.description = html_to_text(community.description_html) + + if 'rules' in activity_json: + community.rules_html = allowlist_html(activity_json['rules']) + community.rules = html_to_text(community.rules_html) + if 'icon' in activity_json and activity_json['icon'] is not None: if isinstance(activity_json['icon'], dict) and 'url' in activity_json['icon']: icon_entry = activity_json['icon']['url'] @@ -816,6 +858,8 @@ def post_json_to_model(activity_log, post_json, user, community) -> Post: post.body_html = allowlist_html(post_json['content']) if 'source' in post_json and post_json['source']['mediaType'] == 'text/markdown': post.body = post_json['source']['content'] + if '::: spoiler' in post.body: + post.body_html = markdown_to_html(post.body) # overwrite as Lemmy doesn't convert spoiler contents into HTML very well else: post.body = html_to_text(post.body_html) elif post_json['mediaType'] == 'text/markdown': @@ -1063,19 +1107,6 @@ def make_image_sizes_async(file_id, thumbnail_width, medium_width, directory, to db.session.commit() -# create a summary from markdown if present, otherwise use html if available -def parse_summary(user_json) -> str: - if 'source' in user_json and user_json['source'].get('mediaType') == 'text/markdown': - # Convert Markdown to HTML - markdown_text = user_json['source']['content'] - html_content = lemmy_markdown_to_html(markdown_text) - return html_content - elif 'summary' in user_json: - return allowlist_html(user_json['summary']) - else: - return '' - - def find_reply_parent(in_reply_to: str) -> Tuple[int, int, int]: 
if 'comment' in in_reply_to: parent_comment = PostReply.get_by_ap_id(in_reply_to) @@ -1290,7 +1321,7 @@ def delete_post_or_comment_task(user_ap_id, community_ap_id, to_be_deleted_ap_id to_delete.post.reply_count -= 1 if to_delete.has_replies(): to_delete.body = 'Deleted by author' if to_delete.author.id == deletor.id else 'Deleted by moderator' - to_delete.body_html = lemmy_markdown_to_html(to_delete.body) + to_delete.body_html = markdown_to_html(to_delete.body) else: to_delete.delete_dependencies() to_delete.deleted = True @@ -1363,7 +1394,7 @@ def remove_data_from_banned_user_task(deletor_ap_id, user_ap_id, target): post_reply.post.reply_count -= 1 if post_reply.has_replies(): post_reply.body = 'Banned' - post_reply.body_html = lemmy_markdown_to_html(post_reply.body) + post_reply.body_html = markdown_to_html(post_reply.body) else: post_reply.delete_dependencies() post_reply.deleted = True @@ -1560,6 +1591,8 @@ def create_post_reply(activity_log: ActivityPubLog, community: Community, in_rep if 'source' in request_json['object'] and isinstance(request_json['object']['source'], dict) and \ 'mediaType' in request_json['object']['source'] and request_json['object']['source']['mediaType'] == 'text/markdown': post_reply.body = request_json['object']['source']['content'] + if '::: spoiler' in post_reply.body: + post_reply.body_html = markdown_to_html(post_reply.body) # overwrite as Lemmy doesn't convert spoiler contents into HTML very well else: post_reply.body = html_to_text(post_reply.body_html) # Language - Lemmy uses 'language' while Mastodon uses 'contentMap' @@ -1678,6 +1711,8 @@ def create_post(activity_log: ActivityPubLog, community: Community, request_json post.body_html = allowlist_html(request_json['object']['content']) if 'source' in request_json['object'] and isinstance(request_json['object']['source'], dict) and request_json['object']['source']['mediaType'] == 'text/markdown': post.body = request_json['object']['source']['content'] + if '::: spoiler' in 
post.body: + post.body_html = markdown_to_html(post.body) # overwrite as Lemmy doesn't convert spoiler contents into HTML very well else: post.body = html_to_text(post.body_html) elif 'mediaType' in request_json['object'] and request_json['object']['mediaType'] == 'text/markdown': @@ -1922,6 +1957,8 @@ def update_post_reply_from_activity(reply: PostReply, request_json: dict): if 'source' in request_json['object'] and isinstance(request_json['object']['source'], dict) and \ 'mediaType' in request_json['object']['source'] and request_json['object']['source']['mediaType'] == 'text/markdown': reply.body = request_json['object']['source']['content'] + if '::: spoiler' in reply.body: + reply.body_html = markdown_to_html(reply.body) # overwrite as Lemmy doesn't convert spoiler contents into HTML very well else: reply.body = html_to_text(reply.body_html) # Language @@ -1945,6 +1982,8 @@ def update_post_from_activity(post: Post, request_json: dict): post.body_html = allowlist_html(request_json['object']['content']) if 'source' in request_json['object'] and isinstance(request_json['object']['source'], dict) and request_json['object']['source']['mediaType'] == 'text/markdown': post.body = request_json['object']['source']['content'] + if '::: spoiler' in post.body: + post.body_html = markdown_to_html(post.body) # overwrite as Lemmy doesn't convert spoiler contents into HTML very well else: post.body = html_to_text(post.body_html) elif 'mediaType' in request_json['object'] and request_json['object']['mediaType'] == 'text/markdown': diff --git a/app/utils.py b/app/utils.py index ef022242..2cee9834 100644 --- a/app/utils.py +++ b/app/utils.py @@ -317,24 +317,6 @@ def markdown_to_html(markdown_text, anchors_new_tab=True) -> str: return '' -# Have started process of replacing this function, and just using Lemmy's HTML 'content' field, same as other platforms that only provide that. 
-# Lemmy's MD supports line breaks as SPACE-SPACE-NEWLINE or SPACE-BACKSLASH-NEWLINE but Markdown2 can't support both: without the 'breaks' -# extra, it doesn't translate SPACE-BACKSLASH-NEWLINE to
<br />, but with it it doesn't translate SPACE-SPACE-NEWLINE to <br />
- -# done so far: post bodies (backfilled), post bodies (create), post bodies (edit), replies (create), replies (edit) -# not done yet: user profiles, community descriptions, chat messages, over-writing with 'banned' or 'deleted by author', replies from autotl;dr bot - -# this is for lemmy's version of Markdown (can be removed in future - when HTML from them filtered through an allow_list is used, instead of MD) -def lemmy_markdown_to_html(markdown_text) -> str: - if markdown_text: - raw_html = markdown2.markdown(markdown_text, safe_mode=True, extras={'middle-word-em': False, 'tables': True, - 'fenced-code-blocks': True, 'strike': True, - 'breaks': {'on_newline': False, 'on_backslash': True}}) - return allowlist_html(raw_html) - else: - return '' - - def markdown_to_text(markdown_text) -> str: if not markdown_text or markdown_text == '': return '' From bdb201f4d3f7fa67c5a51e52c291b15921bd393a Mon Sep 17 00:00:00 2001 From: freamon Date: Sat, 21 Sep 2024 20:12:21 +0000 Subject: [PATCH 3/4] Remove safe_mode=True for Markdown2, as most things go straight through allowlist_html, and it mangles code in blocks --- app/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/utils.py b/app/utils.py index 2cee9834..e0199917 100644 --- a/app/utils.py +++ b/app/utils.py @@ -310,7 +310,7 @@ def allowlist_html(html: str, a_target='_blank') -> str: # this is for pyfedi's version of Markdown (differs from lemmy for: newlines for soft breaks, ...) 
def markdown_to_html(markdown_text, anchors_new_tab=True) -> str: if markdown_text: - raw_html = markdown2.markdown(markdown_text, safe_mode=True, + raw_html = markdown2.markdown(markdown_text, extras={'middle-word-em': False, 'tables': True, 'fenced-code-blocks': True, 'strike': True, 'breaks': {'on_newline': True, 'on_backslash': True}}) return allowlist_html(raw_html, a_target='_blank' if anchors_new_tab else '') else: From 646bcdf7be542af334b10d1b66bcbaa6ec1b159c Mon Sep 17 00:00:00 2001 From: freamon Date: Sat, 21 Sep 2024 21:03:08 +0000 Subject: [PATCH 4/4] avoid wrapping anchors around existing anchors (e.g. if raw URL already wrapped by remote PieFed instance) --- app/utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/app/utils.py b/app/utils.py index e0199917..5ddc8819 100644 --- a/app/utils.py +++ b/app/utils.py @@ -284,6 +284,10 @@ def allowlist_html(html: str, a_target='_blank') -> str: clean_html = str(soup) + # avoid wrapping anchors around existing anchors (e.g. if raw URL already wrapped by remote PieFed instance) + re_double_anchor = re.compile(r'(.*?<\/a>)<\/a>') + clean_html = re_double_anchor.sub(r'\1', clean_html) + # avoid returning empty anchors re_empty_anchor = re.compile(r'<\/a>') clean_html = re_empty_anchor.sub(r'\1', clean_html)