diff --git a/app/activitypub/util.py b/app/activitypub/util.py index f884cd78..723a9030 100644 --- a/app/activitypub/util.py +++ b/app/activitypub/util.py @@ -846,14 +846,13 @@ def post_json_to_model(activity_log, post_json, user, community) -> Post: instance_id=user.instance_id, indexable = user.indexable ) - if 'source' in post_json and \ - post_json['source']['mediaType'] == 'text/markdown': - post.body = post_json['source']['content'] - post.body_html = lemmy_markdown_to_html(post.body) - elif 'content' in post_json: + if 'content' in post_json: if post_json['mediaType'] == 'text/html': post.body_html = allowlist_html(post_json['content']) - post.body = html_to_text(post.body_html) + if 'source' in post_json and post_json['source']['mediaType'] == 'text/markdown': + post.body = post_json['source']['content'] + else: + post.body = html_to_text(post.body_html) elif post_json['mediaType'] == 'text/markdown': post.body = post_json['content'] post.body_html = markdown_to_html(post.body) @@ -1711,17 +1710,15 @@ def create_post_reply(activity_log: ActivityPubLog, community: Community, in_rep ap_create_id=request_json['id'], ap_announce_id=announce_id, instance_id=user.instance_id) - # Get comment content. Lemmy puts this in unusual place. - if 'source' in request_json['object'] and isinstance(request_json['object']['source'], dict) and \ - 'mediaType' in request_json['object']['source'] and \ - request_json['object']['source']['mediaType'] == 'text/markdown': - post_reply.body = request_json['object']['source']['content'] - post_reply.body_html = lemmy_markdown_to_html(post_reply.body) - elif 'content' in request_json['object']: # Kbin, Mastodon, etc provide their posts as html + if 'content' in request_json['object']: # Kbin, Mastodon, etc provide their posts as html if not request_json['object']['content'].startswith('
') or not request_json['object']['content'].startswith('
'): request_json['object']['content'] = '' + request_json['object']['content'] + '
' post_reply.body_html = allowlist_html(request_json['object']['content']) - post_reply.body = html_to_text(post_reply.body_html) + if 'source' in request_json['object'] and isinstance(request_json['object']['source'], dict) and \ + 'mediaType' in request_json['object']['source'] and request_json['object']['source']['mediaType'] == 'text/markdown': + post_reply.body = request_json['object']['source']['content'] + else: + post_reply.body = html_to_text(post_reply.body_html) # Language - Lemmy uses 'language' while Mastodon uses 'contentMap' if 'language' in request_json['object'] and isinstance(request_json['object']['language'], dict): language = find_language_or_create(request_json['object']['language']['identifier'], @@ -1843,18 +1840,19 @@ def create_post(activity_log: ActivityPubLog, community: Community, request_json indexable=user.indexable, microblog=microblog ) - # Get post content. Lemmy and Kbin put this in different places. - if 'source' in request_json['object'] and isinstance(request_json['object']['source'], dict) and request_json['object']['source']['mediaType'] == 'text/markdown': # Lemmy - post.body = request_json['object']['source']['content'] - post.body_html = lemmy_markdown_to_html(post.body) - elif 'content' in request_json['object'] and request_json['object']['content'] is not None: # Kbin + if 'content' in request_json['object'] and request_json['object']['content'] is not None: if 'mediaType' in request_json['object'] and request_json['object']['mediaType'] == 'text/html': post.body_html = allowlist_html(request_json['object']['content']) - post.body = html_to_text(post.body_html) + if 'source' in request_json['object'] and isinstance(request_json['object']['source'], dict) and request_json['object']['source']['mediaType'] == 'text/markdown': + post.body = request_json['object']['source']['content'] + else: + post.body = html_to_text(post.body_html) elif 'mediaType' in request_json['object'] and request_json['object']['mediaType'] == 'text/markdown': post.body = request_json['object']['content'] post.body_html = markdown_to_html(post.body) else: + if not request_json['object']['content'].startswith('') or not request_json['object']['content'].startswith('
'): + request_json['object']['content'] = '' + request_json['object']['content'] + '
' post.body_html = allowlist_html(request_json['object']['content']) post.body = html_to_text(post.body_html) if microblog: @@ -2070,14 +2068,15 @@ def notify_about_post_reply(parent_reply: Union[PostReply, None], new_reply: Pos def update_post_reply_from_activity(reply: PostReply, request_json: dict): - if 'source' in request_json['object'] and \ - isinstance(request_json['object']['source'], dict) and \ - request_json['object']['source']['mediaType'] == 'text/markdown': - reply.body = request_json['object']['source']['content'] - reply.body_html = lemmy_markdown_to_html(reply.body) - elif 'content' in request_json['object']: + if 'content' in request_json['object']: # Kbin, Mastodon, etc provide their posts as html + if not request_json['object']['content'].startswith('') or not request_json['object']['content'].startswith('
'): + request_json['object']['content'] = '' + request_json['object']['content'] + '
' reply.body_html = allowlist_html(request_json['object']['content']) - reply.body = '' + if 'source' in request_json['object'] and isinstance(request_json['object']['source'], dict) and \ + 'mediaType' in request_json['object']['source'] and request_json['object']['source']['mediaType'] == 'text/markdown': + reply.body = request_json['object']['source']['content'] + else: + reply.body = html_to_text(post_reply.body_html) # Language if 'language' in request_json['object'] and isinstance(request_json['object']['language'], dict): language = find_language_or_create(request_json['object']['language']['identifier'], request_json['object']['language']['name']) @@ -2094,19 +2093,19 @@ def update_post_from_activity(post: Post, request_json: dict): nsfl_in_title = '[NSFL]' in name.upper() or '(NSFL)' in name.upper() post.title = name - if 'source' in request_json['object'] and \ - isinstance(request_json['object']['source'], dict) and \ - request_json['object']['source']['mediaType'] == 'text/markdown': - post.body = request_json['object']['source']['content'] - post.body_html = lemmy_markdown_to_html(post.body) - elif 'content' in request_json['object'] and request_json['object']['content'] is not None: # Kbin + if 'content' in request_json['object'] and request_json['object']['content'] is not None: if 'mediaType' in request_json['object'] and request_json['object']['mediaType'] == 'text/html': post.body_html = allowlist_html(request_json['object']['content']) - post.body = html_to_text(post.body_html) + if 'source' in request_json['object'] and isinstance(request_json['object']['source'], dict) and request_json['object']['source']['mediaType'] == 'text/markdown': + post.body = request_json['object']['source']['content'] + else: + post.body = html_to_text(post.body_html) elif 'mediaType' in request_json['object'] and request_json['object']['mediaType'] == 'text/markdown': post.body = request_json['object']['content'] post.body_html = markdown_to_html(post.body) else: + if not request_json['object']['content'].startswith('') or not request_json['object']['content'].startswith('
'): + request_json['object']['content'] = '' + request_json['object']['content'] + '
' post.body_html = allowlist_html(request_json['object']['content']) post.body = html_to_text(post.body_html) if name == "[Microblog]": diff --git a/app/utils.py b/app/utils.py index 65cd3bc3..d73efe3e 100644 --- a/app/utils.py +++ b/app/utils.py @@ -270,9 +270,29 @@ def allowlist_html(html: str, a_target='_blank') -> str: if tag.name == 'table': tag.attrs['class'] = 'table' + clean_html = str(soup) + # avoid returning empty anchors re_empty_anchor = re.compile(r'<\/a>') - return re_empty_anchor.sub(r'\1', str(soup)) + clean_html = re_empty_anchor.sub(r'\1', clean_html) + + # replace lemmy's spoiler markdown left in HTML + re_spoiler = re.compile(r':{3}\s*?spoiler\s+?(\S.+?)(?:\n|)(.+?)(?:\n|):{3}', re.S) + clean_html = re_spoiler.sub(r'
)(.+?)(?:\n|', clean_html) + + # replace strikethough markdown left in HTML + re_strikethough = re.compile(r'~~(.*)~~') + clean_html = re_strikethough.sub(r'\1
\2
\1', clean_html) + + # replace subscript markdown left in HTML + re_subscript = re.compile(r'~(.*)~') + clean_html = re_subscript.sub(r'\1', clean_html) + + # replace superscript markdown left in HTML + re_superscript = re.compile(r'\^(.*)\^') + clean_html = re_superscript.sub(r'\1', clean_html) + + return clean_html # this is for pyfedi's version of Markdown (differs from lemmy for: newlines for soft breaks, ...) @@ -280,23 +300,24 @@ def markdown_to_html(markdown_text, anchors_new_tab=True) -> str: if markdown_text: raw_html = markdown2.markdown(markdown_text, safe_mode=True, extras={'middle-word-em': False, 'tables': True, 'fenced-code-blocks': True, 'strike': True, 'breaks': {'on_newline': True, 'on_backslash': True}}) - # support lemmy's spoiler format - re_spoiler = re.compile(r':{3}\s*?spoiler\s+?(\S.+?)(?:\n|):{3}', re.S) - raw_html = re_spoiler.sub(r'
)(.+?)(?:\n|', raw_html) return allowlist_html(raw_html, a_target='_blank' if anchors_new_tab else '') else: return '' +# Have started process of replacing this function, and just using Lemmy's HTML 'content' field, same as other platforms that only provide that. +# Lemmy's MD supports line breaks as SPACE-SPACE-NEWLINE or SPACE-BACKSLASH-NEWLINE but Markdown2 can't support both: without the 'breaks' +# extra, it doesn't translate SPACE-BACKSLASH-NEWLINE to\1
\2
, but with it it doesn't translate SPACE-SPACE-NEWLINE to
+ +# done so far: post bodies (backfilled), post bodies (create), post bodies (edit), replies (create), replies (edit) +# not done yet: user profiles, community descriptions, chat messages, over-writing with 'banned' or 'deleted by author', replies from autotl;dr bot + # this is for lemmy's version of Markdown (can be removed in future - when HTML from them filtered through an allow_list is used, instead of MD) def lemmy_markdown_to_html(markdown_text) -> str: if markdown_text: raw_html = markdown2.markdown(markdown_text, safe_mode=True, extras={'middle-word-em': False, 'tables': True, 'fenced-code-blocks': True, 'strike': True, 'breaks': {'on_newline': False, 'on_backslash': True}}) - # replace lemmy spoiler tokens with appropriate html tags instead. - re_spoiler = re.compile(r':{3}\s*?spoiler\s+?(\S.+?)(?:\n|):{3}', re.S) - raw_html = re_spoiler.sub(r'
', raw_html) return allowlist_html(raw_html) else: return ''\1
\2