post.body needs to have something in it because that field is used for the search index

2025-01-23 19:36:56 -08:00 · 2024-05-29 15:19:32 +12:00 · 2024-05-29 15:19:32 +12:00 · 57550f02b9
commit 57550f02b9
parent 33b8b65f1b
2 changed files with 16 additions and 7 deletions
--- a/app/activitypub/util.py
+++ b/app/activitypub/util.py
@@ -30,7 +30,8 @@ from app.utils import get_request, allowlist_html, get_setting, ap_datetime, mar
    is_image_url, domain_from_url, gibberish, ensure_directory_exists, markdown_to_text, head_request, post_ranking, \
    shorten_string, reply_already_exists, reply_is_just_link_to_gif_reaction, confidence, remove_tracking_from_link, \
    blocked_phrases, microblog_content_to_title, generate_image_from_video_url, is_video_url, reply_is_stupid, \
-    notification_subscribers, communities_banned_from, lemmy_markdown_to_html, actor_contains_blocked_words
+    notification_subscribers, communities_banned_from, lemmy_markdown_to_html, actor_contains_blocked_words, \
+    html_to_text


 def public_key():
@@ -811,7 +812,7 @@ def post_json_to_model(activity_log, post_json, user, community) -> Post:
        elif 'content' in post_json:
            if post_json['mediaType'] == 'text/html':
                post.body_html = allowlist_html(post_json['content'])
-                post.body = ''
+                post.body = html_to_text(post.body_html)
            elif post_json['mediaType'] == 'text/markdown':
                post.body = post_json['content']
                post.body_html = markdown_to_html(post.body)
@@ -1613,13 +1614,13 @@ def create_post(activity_log: ActivityPubLog, community: Community, request_json
    elif 'content' in request_json['object'] and request_json['object']['content'] is not None: # Kbin
        if 'mediaType' in request_json['object'] and request_json['object']['mediaType'] == 'text/html':
            post.body_html = allowlist_html(request_json['object']['content'])
-            post.body = ''
+            post.body = html_to_text(post.body_html)
        elif 'mediaType' in request_json['object'] and request_json['object']['mediaType'] == 'text/markdown':
            post.body = request_json['object']['content']
            post.body_html = markdown_to_html(post.body)
        else:
            post.body_html = allowlist_html(request_json['object']['content'])
-            post.body = ''
+            post.body = html_to_text(post.body_html)
        if name == "[Microblog]":
            name += ' ' + microblog_content_to_title(post.body_html)
            if '[NSFL]' in name.upper() or '(NSFL)' in name.upper():
@@ -1839,13 +1840,13 @@ def update_post_from_activity(post: Post, request_json: dict):
    elif 'content' in request_json['object'] and request_json['object']['content'] is not None: # Kbin
        if 'mediaType' in request_json['object'] and request_json['object']['mediaType'] == 'text/html':
            post.body_html = allowlist_html(request_json['object']['content'])
-            post.body = ''
+            post.body = html_to_text(post.body_html)
        elif 'mediaType' in request_json['object'] and request_json['object']['mediaType'] == 'text/markdown':
            post.body = request_json['object']['content']
            post.body_html = markdown_to_html(post.body)
        else:
            post.body_html = allowlist_html(request_json['object']['content'])
-            post.body = ''
+            post.body = html_to_text(post.body_html)
        if name == "[Microblog]":
            name += ' ' + microblog_content_to_title(post.body_html)
            nsfl_in_title = '[NSFL]' in name.upper() or '(NSFL)' in name.upper()
--- a/app/cli.py
+++ b/app/cli.py
@@ -25,7 +25,7 @@ from app.models import Settings, BannedInstances, Interest, Role, User, RolePerm
    utcnow, Site, Instance, File, Notification, Post, CommunityMember, NotificationSubscription, PostReply, Language, \
    Tag, InstanceRole
 from app.utils import file_get_contents, retrieve_block_list, blocked_domains, retrieve_peertube_block_list, \
-    shorten_string, get_request
+    shorten_string, get_request, html_to_text


 def register(app):
@@ -274,6 +274,14 @@ def register(app):
                if filesize > 0 and num_content > 0:
                    print(f'{user.id},"{user.ap_id}",{filesize},{num_content}')

+    @app.cli.command("repair-search")
+    def repair_search():
+        with app.app_context():
+            for post in Post.query.filter(Post.body == '', Post.body_html != ''):
+                post.body = html_to_text(post.body_html)
+                db.session.commit()
+        print('Done')
+
    def list_files(directory):
        for root, dirs, files in os.walk(directory):
            for file in files: