Start using Lemmy's HTML 'content' field through an allowlist instead of translating Markdown

2025-01-23 19:36:56 -08:00 · 2024-08-25 15:58:38 +00:00 · 2024-08-25 15:58:38 +00:00 · 2069ca517e
commit 2069ca517e
parent b73b6fdcae
2 changed files with 61 additions and 41 deletions
--- a/app/activitypub/util.py
+++ b/app/activitypub/util.py
@ -846,13 +846,12 @@ def post_json_to_model(activity_log, post_json, user, community) -> Post:
                    instance_id=user.instance_id,
                    indexable = user.indexable
                    )
-        if 'source' in post_json and \
-                post_json['source']['mediaType'] == 'text/markdown':
-            post.body = post_json['source']['content']
-            post.body_html = lemmy_markdown_to_html(post.body)
-        elif 'content' in post_json:
+        if 'content' in post_json:
            if post_json['mediaType'] == 'text/html':
                post.body_html = allowlist_html(post_json['content'])
+                if 'source' in post_json and post_json['source']['mediaType'] == 'text/markdown':
+                    post.body = post_json['source']['content']
+                else:
                    post.body = html_to_text(post.body_html)
            elif post_json['mediaType'] == 'text/markdown':
                post.body = post_json['content']
@ -1711,16 +1710,14 @@ def create_post_reply(activity_log: ActivityPubLog, community: Community, in_rep
                               ap_create_id=request_json['id'],
                               ap_announce_id=announce_id,
                               instance_id=user.instance_id)
-        # Get comment content. Lemmy puts this in unusual place.
-        if 'source' in request_json['object'] and isinstance(request_json['object']['source'], dict) and \
-                'mediaType' in request_json['object']['source'] and \
-                request_json['object']['source']['mediaType'] == 'text/markdown':
-            post_reply.body = request_json['object']['source']['content']
-            post_reply.body_html = lemmy_markdown_to_html(post_reply.body)
-        elif 'content' in request_json['object']:   # Kbin, Mastodon, etc provide their posts as html
+        if 'content' in request_json['object']:   # Kbin, Mastodon, etc provide their posts as html
            if not request_json['object']['content'].startswith('<p>') or not request_json['object']['content'].startswith('<blockquote>'):
                request_json['object']['content'] = '<p>' + request_json['object']['content'] + '</p>'
            post_reply.body_html = allowlist_html(request_json['object']['content'])
+            if 'source' in request_json['object'] and isinstance(request_json['object']['source'], dict) and \
+                    'mediaType' in request_json['object']['source'] and request_json['object']['source']['mediaType'] == 'text/markdown':
+                post_reply.body = request_json['object']['source']['content']
+            else:
                post_reply.body = html_to_text(post_reply.body_html)
        # Language - Lemmy uses 'language' while Mastodon uses 'contentMap'
        if 'language' in request_json['object'] and isinstance(request_json['object']['language'], dict):
@ -1843,18 +1840,19 @@ def create_post(activity_log: ActivityPubLog, community: Community, request_json
                indexable=user.indexable,
                microblog=microblog
                )
-    # Get post content. Lemmy and Kbin put this in different places.
-    if 'source' in request_json['object'] and isinstance(request_json['object']['source'], dict) and request_json['object']['source']['mediaType'] == 'text/markdown': # Lemmy
-        post.body = request_json['object']['source']['content']
-        post.body_html = lemmy_markdown_to_html(post.body)
-    elif 'content' in request_json['object'] and request_json['object']['content'] is not None: # Kbin
+    if 'content' in request_json['object'] and request_json['object']['content'] is not None:
        if 'mediaType' in request_json['object'] and request_json['object']['mediaType'] == 'text/html':
            post.body_html = allowlist_html(request_json['object']['content'])
+            if 'source' in request_json['object'] and isinstance(request_json['object']['source'], dict) and request_json['object']['source']['mediaType'] == 'text/markdown':
+                post.body = request_json['object']['source']['content']
+            else:
                post.body = html_to_text(post.body_html)
        elif 'mediaType' in request_json['object'] and request_json['object']['mediaType'] == 'text/markdown':
            post.body = request_json['object']['content']
            post.body_html = markdown_to_html(post.body)
        else:
+            if not request_json['object']['content'].startswith('<p>') or not request_json['object']['content'].startswith('<blockquote>'):
+                request_json['object']['content'] = '<p>' + request_json['object']['content'] + '</p>'
            post.body_html = allowlist_html(request_json['object']['content'])
            post.body = html_to_text(post.body_html)
        if microblog:
@ -2070,14 +2068,15 @@ def notify_about_post_reply(parent_reply: Union[PostReply, None], new_reply: Pos


 def update_post_reply_from_activity(reply: PostReply, request_json: dict):
-    if 'source' in request_json['object'] and \
-            isinstance(request_json['object']['source'], dict) and \
-            request_json['object']['source']['mediaType'] == 'text/markdown':
-        reply.body = request_json['object']['source']['content']
-        reply.body_html = lemmy_markdown_to_html(reply.body)
-    elif 'content' in request_json['object']:
+    if 'content' in request_json['object']:   # Kbin, Mastodon, etc provide their posts as html
+        if not request_json['object']['content'].startswith('<p>') or not request_json['object']['content'].startswith('<blockquote>'):
+            request_json['object']['content'] = '<p>' + request_json['object']['content'] + '</p>'
        reply.body_html = allowlist_html(request_json['object']['content'])
-        reply.body = ''
+        if 'source' in request_json['object'] and isinstance(request_json['object']['source'], dict) and \
+            'mediaType' in request_json['object']['source'] and request_json['object']['source']['mediaType'] == 'text/markdown':
+            reply.body = request_json['object']['source']['content']
+        else:
+            reply.body = html_to_text(post_reply.body_html)
    # Language
    if 'language' in request_json['object'] and isinstance(request_json['object']['language'], dict):
        language = find_language_or_create(request_json['object']['language']['identifier'], request_json['object']['language']['name'])
@ -2094,19 +2093,19 @@ def update_post_from_activity(post: Post, request_json: dict):

    nsfl_in_title = '[NSFL]' in name.upper() or '(NSFL)' in name.upper()
    post.title = name
-    if 'source' in request_json['object'] and \
-            isinstance(request_json['object']['source'], dict) and \
-            request_json['object']['source']['mediaType'] == 'text/markdown':
-        post.body = request_json['object']['source']['content']
-        post.body_html = lemmy_markdown_to_html(post.body)
-    elif 'content' in request_json['object'] and request_json['object']['content'] is not None: # Kbin
+    if 'content' in request_json['object'] and request_json['object']['content'] is not None:
        if 'mediaType' in request_json['object'] and request_json['object']['mediaType'] == 'text/html':
            post.body_html = allowlist_html(request_json['object']['content'])
+            if 'source' in request_json['object'] and isinstance(request_json['object']['source'], dict) and request_json['object']['source']['mediaType'] == 'text/markdown':
+                post.body = request_json['object']['source']['content']
+            else:
                post.body = html_to_text(post.body_html)
        elif 'mediaType' in request_json['object'] and request_json['object']['mediaType'] == 'text/markdown':
            post.body = request_json['object']['content']
            post.body_html = markdown_to_html(post.body)
        else:
+            if not request_json['object']['content'].startswith('<p>') or not request_json['object']['content'].startswith('<blockquote>'):
+                request_json['object']['content'] = '<p>' + request_json['object']['content'] + '</p>'
            post.body_html = allowlist_html(request_json['object']['content'])
            post.body = html_to_text(post.body_html)
        if name == "[Microblog]":
--- a/app/utils.py
+++ b/app/utils.py
@ -270,9 +270,29 @@ def allowlist_html(html: str, a_target='_blank') -> str:
            if tag.name == 'table':
                tag.attrs['class'] = 'table'

+    clean_html = str(soup)
+
    # avoid returning empty anchors
    re_empty_anchor = re.compile(r'<a href="(.*?)" rel="nofollow ugc" target="_blank"><\/a>')
-    return re_empty_anchor.sub(r'<a href="\1" rel="nofollow ugc" target="_blank">\1</a>', str(soup))
+    clean_html = re_empty_anchor.sub(r'<a href="\1" rel="nofollow ugc" target="_blank">\1</a>', clean_html)
+
+    # replace lemmy's spoiler markdown left in HTML
+    re_spoiler = re.compile(r':{3}\s*?spoiler\s+?(\S.+?)(?:\n|</p>)(.+?)(?:\n|<p>):{3}', re.S)
+    clean_html = re_spoiler.sub(r'<details><summary>\1</summary><p>\2</p></details>', clean_html)
+
+    # replace strikethrough markdown left in HTML
+    re_strikethough = re.compile(r'~~(.*)~~')
+    clean_html = re_strikethough.sub(r'<s>\1</s>', clean_html)
+
+    # replace subscript markdown left in HTML
+    re_subscript = re.compile(r'~(.*)~')
+    clean_html = re_subscript.sub(r'<sub>\1</sub>', clean_html)
+
+    # replace superscript markdown left in HTML
+    re_superscript = re.compile(r'\^(.*)\^')
+    clean_html = re_superscript.sub(r'<sup>\1</sup>', clean_html)
+
+    return clean_html


 # this is for pyfedi's version of Markdown (differs from lemmy for: newlines for soft breaks, ...)
@ -280,23 +300,24 @@ def markdown_to_html(markdown_text, anchors_new_tab=True) -> str:
    if markdown_text:
        raw_html = markdown2.markdown(markdown_text, safe_mode=True,
                    extras={'middle-word-em': False, 'tables': True, 'fenced-code-blocks': True, 'strike': True, 'breaks': {'on_newline': True, 'on_backslash': True}})
-        # support lemmy's spoiler format
-        re_spoiler = re.compile(r':{3}\s*?spoiler\s+?(\S.+?)(?:\n|</p>)(.+?)(?:\n|<p>):{3}', re.S)
-        raw_html = re_spoiler.sub(r'<details><summary>\1</summary><p>\2</p></details>', raw_html)
        return allowlist_html(raw_html, a_target='_blank' if anchors_new_tab else '')
    else:
        return ''


+# We have started the process of replacing this function by just using Lemmy's HTML 'content' field, the same as for other platforms that only provide that field.
+# Lemmy's MD supports line breaks as SPACE-SPACE-NEWLINE or SPACE-BACKSLASH-NEWLINE, but Markdown2 can't support both: without the 'breaks'
+# extra, it doesn't translate SPACE-BACKSLASH-NEWLINE to <br />, but with it, it doesn't translate SPACE-SPACE-NEWLINE to <br />.
+
+# done so far: post bodies (backfilled), post bodies (create), post bodies (edit), replies (create), replies (edit)
+# not done yet: user profiles, community descriptions, chat messages, over-writing with 'banned' or 'deleted by author', replies from autotl;dr bot
+
 # this is for lemmy's version of Markdown (can be removed in future - when HTML from them filtered through an allow_list is used, instead of MD)
 def lemmy_markdown_to_html(markdown_text) -> str:
    if markdown_text:
        raw_html = markdown2.markdown(markdown_text, safe_mode=True, extras={'middle-word-em': False, 'tables': True,
                                                                             'fenced-code-blocks': True, 'strike': True,
                                                                             'breaks': {'on_newline': False, 'on_backslash': True}})
-        # replace lemmy spoiler tokens with appropriate html tags instead.
-        re_spoiler = re.compile(r':{3}\s*?spoiler\s+?(\S.+?)(?:\n|</p>)(.+?)(?:\n|<p>):{3}', re.S)
-        raw_html = re_spoiler.sub(r'<details><summary>\1</summary><p>\2</p></details>', raw_html)
        return allowlist_html(raw_html)
    else:
        return ''