From 2069ca517ecdc0a478c64bd967f02272e61bdf7e Mon Sep 17 00:00:00 2001
From: freamon
Date: Sun, 25 Aug 2024 15:58:38 +0000
Subject: [PATCH] Start using Lemmy's HTML 'content' field through an allowlist
instead of translating Markdown
---
app/activitypub/util.py | 67 ++++++++++++++++++++---------------------
app/utils.py | 35 ++++++++++++++++-----
2 files changed, 61 insertions(+), 41 deletions(-)
diff --git a/app/activitypub/util.py b/app/activitypub/util.py
index f884cd78..723a9030 100644
--- a/app/activitypub/util.py
+++ b/app/activitypub/util.py
@@ -846,14 +846,13 @@ def post_json_to_model(activity_log, post_json, user, community) -> Post:
instance_id=user.instance_id,
indexable = user.indexable
)
- if 'source' in post_json and \
- post_json['source']['mediaType'] == 'text/markdown':
- post.body = post_json['source']['content']
- post.body_html = lemmy_markdown_to_html(post.body)
- elif 'content' in post_json:
+ if 'content' in post_json:
if post_json['mediaType'] == 'text/html':
post.body_html = allowlist_html(post_json['content'])
- post.body = html_to_text(post.body_html)
+ if 'source' in post_json and post_json['source']['mediaType'] == 'text/markdown':
+ post.body = post_json['source']['content']
+ else:
+ post.body = html_to_text(post.body_html)
elif post_json['mediaType'] == 'text/markdown':
post.body = post_json['content']
post.body_html = markdown_to_html(post.body)
@@ -1711,17 +1710,15 @@ def create_post_reply(activity_log: ActivityPubLog, community: Community, in_rep
ap_create_id=request_json['id'],
ap_announce_id=announce_id,
instance_id=user.instance_id)
- # Get comment content. Lemmy puts this in unusual place.
- if 'source' in request_json['object'] and isinstance(request_json['object']['source'], dict) and \
- 'mediaType' in request_json['object']['source'] and \
- request_json['object']['source']['mediaType'] == 'text/markdown':
- post_reply.body = request_json['object']['source']['content']
- post_reply.body_html = lemmy_markdown_to_html(post_reply.body)
- elif 'content' in request_json['object']: # Kbin, Mastodon, etc provide their posts as html
+ if 'content' in request_json['object']: # Kbin, Mastodon, etc provide their posts as html
if not request_json['object']['content'].startswith('') or not request_json['object']['content'].startswith('
'):
request_json['object']['content'] = '' + request_json['object']['content'] + '
'
post_reply.body_html = allowlist_html(request_json['object']['content'])
- post_reply.body = html_to_text(post_reply.body_html)
+ if 'source' in request_json['object'] and isinstance(request_json['object']['source'], dict) and \
+ 'mediaType' in request_json['object']['source'] and request_json['object']['source']['mediaType'] == 'text/markdown':
+ post_reply.body = request_json['object']['source']['content']
+ else:
+ post_reply.body = html_to_text(post_reply.body_html)
# Language - Lemmy uses 'language' while Mastodon uses 'contentMap'
if 'language' in request_json['object'] and isinstance(request_json['object']['language'], dict):
language = find_language_or_create(request_json['object']['language']['identifier'],
@@ -1843,18 +1840,19 @@ def create_post(activity_log: ActivityPubLog, community: Community, request_json
indexable=user.indexable,
microblog=microblog
)
- # Get post content. Lemmy and Kbin put this in different places.
- if 'source' in request_json['object'] and isinstance(request_json['object']['source'], dict) and request_json['object']['source']['mediaType'] == 'text/markdown': # Lemmy
- post.body = request_json['object']['source']['content']
- post.body_html = lemmy_markdown_to_html(post.body)
- elif 'content' in request_json['object'] and request_json['object']['content'] is not None: # Kbin
+ if 'content' in request_json['object'] and request_json['object']['content'] is not None:
if 'mediaType' in request_json['object'] and request_json['object']['mediaType'] == 'text/html':
post.body_html = allowlist_html(request_json['object']['content'])
- post.body = html_to_text(post.body_html)
+ if 'source' in request_json['object'] and isinstance(request_json['object']['source'], dict) and request_json['object']['source']['mediaType'] == 'text/markdown':
+ post.body = request_json['object']['source']['content']
+ else:
+ post.body = html_to_text(post.body_html)
elif 'mediaType' in request_json['object'] and request_json['object']['mediaType'] == 'text/markdown':
post.body = request_json['object']['content']
post.body_html = markdown_to_html(post.body)
else:
+ if not request_json['object']['content'].startswith('') or not request_json['object']['content'].startswith('
'):
+ request_json['object']['content'] = '' + request_json['object']['content'] + '
'
post.body_html = allowlist_html(request_json['object']['content'])
post.body = html_to_text(post.body_html)
if microblog:
@@ -2070,14 +2068,15 @@ def notify_about_post_reply(parent_reply: Union[PostReply, None], new_reply: Pos
def update_post_reply_from_activity(reply: PostReply, request_json: dict):
- if 'source' in request_json['object'] and \
- isinstance(request_json['object']['source'], dict) and \
- request_json['object']['source']['mediaType'] == 'text/markdown':
- reply.body = request_json['object']['source']['content']
- reply.body_html = lemmy_markdown_to_html(reply.body)
- elif 'content' in request_json['object']:
+ if 'content' in request_json['object']: # Kbin, Mastodon, etc provide their posts as html
+ if not request_json['object']['content'].startswith('') or not request_json['object']['content'].startswith('
'):
+ request_json['object']['content'] = '' + request_json['object']['content'] + '
'
reply.body_html = allowlist_html(request_json['object']['content'])
- reply.body = ''
+ if 'source' in request_json['object'] and isinstance(request_json['object']['source'], dict) and \
+ 'mediaType' in request_json['object']['source'] and request_json['object']['source']['mediaType'] == 'text/markdown':
+ reply.body = request_json['object']['source']['content']
+ else:
+ reply.body = html_to_text(reply.body_html)  # 'reply' is this function's parameter; 'post_reply' is not defined here
# Language
if 'language' in request_json['object'] and isinstance(request_json['object']['language'], dict):
language = find_language_or_create(request_json['object']['language']['identifier'], request_json['object']['language']['name'])
@@ -2094,19 +2093,19 @@ def update_post_from_activity(post: Post, request_json: dict):
nsfl_in_title = '[NSFL]' in name.upper() or '(NSFL)' in name.upper()
post.title = name
- if 'source' in request_json['object'] and \
- isinstance(request_json['object']['source'], dict) and \
- request_json['object']['source']['mediaType'] == 'text/markdown':
- post.body = request_json['object']['source']['content']
- post.body_html = lemmy_markdown_to_html(post.body)
- elif 'content' in request_json['object'] and request_json['object']['content'] is not None: # Kbin
+ if 'content' in request_json['object'] and request_json['object']['content'] is not None:
if 'mediaType' in request_json['object'] and request_json['object']['mediaType'] == 'text/html':
post.body_html = allowlist_html(request_json['object']['content'])
- post.body = html_to_text(post.body_html)
+ if 'source' in request_json['object'] and isinstance(request_json['object']['source'], dict) and request_json['object']['source']['mediaType'] == 'text/markdown':
+ post.body = request_json['object']['source']['content']
+ else:
+ post.body = html_to_text(post.body_html)
elif 'mediaType' in request_json['object'] and request_json['object']['mediaType'] == 'text/markdown':
post.body = request_json['object']['content']
post.body_html = markdown_to_html(post.body)
else:
+ if not request_json['object']['content'].startswith('') or not request_json['object']['content'].startswith('
'):
+ request_json['object']['content'] = '' + request_json['object']['content'] + '
'
post.body_html = allowlist_html(request_json['object']['content'])
post.body = html_to_text(post.body_html)
if name == "[Microblog]":
diff --git a/app/utils.py b/app/utils.py
index 65cd3bc3..d73efe3e 100644
--- a/app/utils.py
+++ b/app/utils.py
@@ -270,9 +270,29 @@ def allowlist_html(html: str, a_target='_blank') -> str:
if tag.name == 'table':
tag.attrs['class'] = 'table'
+ clean_html = str(soup)
+
# avoid returning empty anchors
re_empty_anchor = re.compile(r'<\/a>')
- return re_empty_anchor.sub(r'\1', str(soup))
+ clean_html = re_empty_anchor.sub(r'\1', clean_html)
+
+ # replace lemmy's spoiler markdown left in HTML
+ re_spoiler = re.compile(r':{3}\s*?spoiler\s+?(\S.+?)(?:\n|
)(.+?)(?:\n|):{3}', re.S)
+ clean_html = re_spoiler.sub(r'\1
\2
', clean_html)
+
+ # replace strikethrough markdown left in HTML
+ re_strikethough = re.compile(r'~~(.*)~~')
+ clean_html = re_strikethough.sub(r'\1', clean_html)
+
+ # replace subscript markdown left in HTML
+ re_subscript = re.compile(r'~(.*)~')
+ clean_html = re_subscript.sub(r'\1', clean_html)
+
+ # replace superscript markdown left in HTML
+ re_superscript = re.compile(r'\^(.*)\^')
+ clean_html = re_superscript.sub(r'\1', clean_html)
+
+ return clean_html
# this is for pyfedi's version of Markdown (differs from lemmy for: newlines for soft breaks, ...)
@@ -280,23 +300,24 @@ def markdown_to_html(markdown_text, anchors_new_tab=True) -> str:
if markdown_text:
raw_html = markdown2.markdown(markdown_text, safe_mode=True,
extras={'middle-word-em': False, 'tables': True, 'fenced-code-blocks': True, 'strike': True, 'breaks': {'on_newline': True, 'on_backslash': True}})
- # support lemmy's spoiler format
- re_spoiler = re.compile(r':{3}\s*?spoiler\s+?(\S.+?)(?:\n|
)(.+?)(?:\n|):{3}', re.S)
- raw_html = re_spoiler.sub(r'\1
\2
', raw_html)
return allowlist_html(raw_html, a_target='_blank' if anchors_new_tab else '')
else:
return ''
+# Have started process of replacing this function, and just using Lemmy's HTML 'content' field, same as other platforms that only provide that.
+# Lemmy's MD supports line breaks as SPACE-SPACE-NEWLINE or SPACE-BACKSLASH-NEWLINE but Markdown2 can't support both: without the 'breaks'
+# extra, it doesn't translate SPACE-BACKSLASH-NEWLINE to <br />, but with it it doesn't translate SPACE-SPACE-NEWLINE to <br />
+
+# done so far: post bodies (backfilled), post bodies (create), post bodies (edit), replies (create), replies (edit)
+# not done yet: user profiles, community descriptions, chat messages, over-writing with 'banned' or 'deleted by author', replies from autotl;dr bot
+
# this is for lemmy's version of Markdown (can be removed in future - when HTML from them filtered through an allow_list is used, instead of MD)
def lemmy_markdown_to_html(markdown_text) -> str:
if markdown_text:
raw_html = markdown2.markdown(markdown_text, safe_mode=True, extras={'middle-word-em': False, 'tables': True,
'fenced-code-blocks': True, 'strike': True,
'breaks': {'on_newline': False, 'on_backslash': True}})
- # replace lemmy spoiler tokens with appropriate html tags instead.
- re_spoiler = re.compile(r':{3}\s*?spoiler\s+?(\S.+?)(?:\n|
)(.+?)(?:\n|):{3}', re.S)
- raw_html = re_spoiler.sub(r'\1
\2
', raw_html)
return allowlist_html(raw_html)
else:
return ''