From 50db96fb6c20964f7a1ffa8e47348b4badcccb16 Mon Sep 17 00:00:00 2001 From: rimu <3310831+rimu@users.noreply.github.com> Date: Wed, 27 Mar 2024 16:02:04 +1300 Subject: [PATCH] remove broken html_to_markdown function --- app/activitypub/routes.py | 2 +- app/activitypub/util.py | 18 +++++++++--------- app/community/util.py | 2 +- app/utils.py | 35 ----------------------------------- 4 files changed, 11 insertions(+), 46 deletions(-) diff --git a/app/activitypub/routes.py b/app/activitypub/routes.py index 6bd0a8cf..9118be54 100644 --- a/app/activitypub/routes.py +++ b/app/activitypub/routes.py @@ -19,7 +19,7 @@ from app.activitypub.util import public_key, users_total, active_half_year, acti upvote_post, activity_already_ingested, delete_post_or_comment, community_members, \ user_removed_from_remote_server, create_post, create_post_reply, update_post_reply_from_activity, \ update_post_from_activity, undo_vote, undo_downvote -from app.utils import gibberish, get_setting, is_image_url, allowlist_html, html_to_markdown, render_template, \ +from app.utils import gibberish, get_setting, is_image_url, allowlist_html, render_template, \ domain_from_url, markdown_to_html, community_membership, ap_datetime, markdown_to_text, ip_address, can_downvote, \ can_upvote, can_create_post, awaken_dormant_instance, shorten_string, can_create_post_reply, sha256_digest, \ community_moderators diff --git a/app/activitypub/util.py b/app/activitypub/util.py index ea44912a..7fee9627 100644 --- a/app/activitypub/util.py +++ b/app/activitypub/util.py @@ -22,7 +22,7 @@ from PIL import Image, ImageOps from io import BytesIO import pytesseract -from app.utils import get_request, allowlist_html, html_to_markdown, get_setting, ap_datetime, markdown_to_html, \ +from app.utils import get_request, allowlist_html, get_setting, ap_datetime, markdown_to_html, \ is_image_url, domain_from_url, gibberish, ensure_directory_exists, markdown_to_text, head_request, post_ranking, \ shorten_string, reply_already_exists, reply_is_just_link_to_gif_reaction, confidence, remove_tracking_from_link, \ blocked_phrases @@ -434,7 +434,7 @@ def refresh_community_profile_task(community_id): community.description_html = markdown_to_html(community.description) elif 'content' in activity_json: community.description_html = allowlist_html(activity_json['content']) - community.description = html_to_markdown(community.description_html) + community.description = '' icon_changed = cover_changed = False if 'icon' in activity_json: @@ -585,7 +585,7 @@ def actor_json_to_model(activity_json, address, server): community.description_html = markdown_to_html(community.description) elif 'content' in activity_json: community.description_html = allowlist_html(activity_json['content']) - community.description = html_to_markdown(community.description_html) + community.description = '' if 'icon' in activity_json: icon = File(source_url=activity_json['icon']['url']) community.icon = icon @@ -625,7 +625,7 @@ def post_json_to_model(activity_log, post_json, user, community) -> Post: post.body_html = markdown_to_html(post.body) elif 'content' in post_json: post.body_html = allowlist_html(post_json['content']) - post.body = html_to_markdown(post.body_html) + post.body = '' if 'attachment' in post_json and len(post_json['attachment']) > 0 and 'type' in post_json['attachment'][0]: if post_json['attachment'][0]['type'] == 'Link': post.url = post_json['attachment'][0]['href'] @@ -773,7 +773,7 @@ def parse_summary(user_json) -> str: if 'source' in user_json and user_json['source'].get('mediaType') == 'text/markdown': # Convert Markdown to HTML markdown_text = user_json['source']['content'] - html_content = html_to_markdown(markdown_text) + html_content = allowlist_html(markdown_to_html(markdown_text)) return html_content elif 'summary' in user_json: return allowlist_html(user_json['summary']) @@ -1184,7 +1184,7 @@ def create_post_reply(activity_log: ActivityPubLog, community: Community, in_rep post_reply.body_html = markdown_to_html(post_reply.body) elif 'content' in request_json['object']: # Kbin post_reply.body_html = allowlist_html(request_json['object']['content']) - post_reply.body = html_to_markdown(post_reply.body_html) + post_reply.body = '' if post_id is not None: # Discard post_reply if it contains certain phrases. Good for stopping spam floods. if post_reply.body: @@ -1283,7 +1283,7 @@ def create_post(activity_log: ActivityPubLog, community: Community, request_json post.body_html = markdown_to_html(post.body) elif 'content' in request_json['object'] and request_json['object']['content'] is not None: # Kbin post.body_html = allowlist_html(request_json['object']['content']) - post.body = html_to_markdown(post.body_html) + post.body = '' # Discard post if it contains certain phrases. Good for stopping spam floods. blocked_phrases_list = blocked_phrases() for blocked_phrase in blocked_phrases_list: @@ -1375,7 +1375,7 @@ def update_post_reply_from_activity(reply: PostReply, request_json: dict): reply.body_html = markdown_to_html(reply.body) elif 'content' in request_json['object']: reply.body_html = allowlist_html(request_json['object']['content']) - reply.body = html_to_markdown(reply.body_html) + reply.body = '' reply.edited_at = utcnow() db.session.commit() @@ -1389,7 +1389,7 @@ def update_post_from_activity(post: Post, request_json: dict): post.body_html = markdown_to_html(post.body) elif 'content' in request_json['object']: post.body_html = allowlist_html(request_json['object']['content']) - post.body = html_to_markdown(post.body_html) + post.body = '' if 'attachment' in request_json['object'] and 'href' in request_json['object']['attachment']: post.url = request_json['object']['attachment']['href'] if 'sensitive' in request_json['object']: diff --git a/app/community/util.py b/app/community/util.py index 05e435ef..f58c067c 100644 --- a/app/community/util.py +++ b/app/community/util.py @@ -15,7 +15,7 @@ from app.constants import POST_TYPE_ARTICLE, POST_TYPE_LINK, POST_TYPE_IMAGE from app.models import Community, File, BannedInstances, PostReply, PostVote, Post, utcnow, CommunityMember, Site, \ Instance, Notification, User, ActivityPubLog from app.utils import get_request, gibberish, markdown_to_html, domain_from_url, allowlist_html, \ - html_to_markdown, is_image_url, ensure_directory_exists, inbox_domain, post_ranking, shorten_string, parse_page, \ + is_image_url, ensure_directory_exists, inbox_domain, post_ranking, shorten_string, parse_page, \ remove_tracking_from_link, ap_datetime, instance_banned, blocked_phrases from sqlalchemy import func, desc import os diff --git a/app/utils.py b/app/utils.py index fa556b6a..9e5f7735 100644 --- a/app/utils.py +++ b/app/utils.py @@ -214,41 +214,6 @@ def allowlist_html(html: str) -> str: return str(soup) -# convert basic HTML to Markdown -def html_to_markdown(html: str) -> str: - soup = BeautifulSoup(html, 'html.parser') - return html_to_markdown_worker(soup) - - -def html_to_markdown_worker(element, indent_level=0): - formatted_text = '' - for item in element.contents: - if isinstance(item, str): - formatted_text += item - elif item.name == 'p': - formatted_text += '\n\n' - elif item.name == 'br': - formatted_text += ' \n' # Double space at the end for line break - elif item.name == 'strong': - formatted_text += '**' + html_to_markdown_worker(item) + '**' - elif item.name == 'ul': - formatted_text += '\n' - formatted_text += html_to_markdown_worker(item, indent_level + 1) - formatted_text += '\n' - elif item.name == 'ol': - formatted_text += '\n' - formatted_text += html_to_markdown_worker(item, indent_level + 1) - formatted_text += '\n' - elif item.name == 'li': - bullet = '-' if item.find_parent(['ul', 'ol']) and item.find_previous_sibling() is None else '' - formatted_text += ' ' * indent_level + bullet + ' ' + html_to_markdown_worker(item).strip() + '\n' - elif item.name == 'blockquote': - formatted_text += ' ' * indent_level + '> ' + html_to_markdown_worker(item).strip() + '\n' - elif item.name == 'code': - formatted_text += '`' + html_to_markdown_worker(item) + '`' - return formatted_text - - def markdown_to_html(markdown_text) -> str: if markdown_text: return allowlist_html(markdown2.markdown(markdown_text, safe_mode=True, extras={'middle-word-em': False, 'tables': True, 'fenced-code-blocks': True, 'strike': True}))