Improve thumbnail generation reliability (fetch og:image for federated link posts, tolerate slow or failing image hosts)

2025-02-02 16:21:32 -08:00 · 2024-06-22 14:18:26 +08:00 · 2024-06-22 14:18:26 +08:00 · 115247f422
commit 115247f422
parent 805fd7c5d4
4 changed files with 80 additions and 62 deletions
--- a/app/activitypub/util.py
+++ b/app/activitypub/util.py
@@ -31,7 +31,7 @@ from app.utils import get_request, allowlist_html, get_setting, ap_datetime, mar
    shorten_string, reply_already_exists, reply_is_just_link_to_gif_reaction, confidence, remove_tracking_from_link, \
    blocked_phrases, microblog_content_to_title, generate_image_from_video_url, is_video_url, reply_is_stupid, \
    notification_subscribers, communities_banned_from, lemmy_markdown_to_html, actor_contains_blocked_words, \
-    html_to_text
+    html_to_text, opengraph_parse, url_to_thumbnail_file


 def public_key():
@@ -1734,6 +1734,16 @@ def create_post(activity_log: ActivityPubLog, community: Community, request_json
            image = File(source_url=request_json['object']['image']['url'])
            db.session.add(image)
            post.image = image
+        if post.image is None and post.type == POST_TYPE_LINK: # This is a link post but the source instance has not provided a thumbnail image
+            # Let's see if we can do better than the source instance did!
+            opengraph = opengraph_parse(post.url)
+            if opengraph and (opengraph.get('og:image', '') != '' or opengraph.get('og:image:url', '') != ''):
+                filename = opengraph.get('og:image') or opengraph.get('og:image:url')
+                if not filename.startswith('/'):
+                    file = File(source_url=filename, alt_text=shorten_string(opengraph.get('og:title'), 295))
+                    post.image = file
+                    db.session.add(file)
+
        db.session.add(post)
        post.ranking = post_ranking(post.score, post.posted_at)
        community.post_count += 1
--- a/app/community/routes.py
+++ b/app/community/routes.py
@@ -17,7 +17,7 @@ from app.community.forms import SearchRemoteCommunity, CreateDiscussionForm, Cre
    DeleteCommunityForm, AddCommunityForm, EditCommunityForm, AddModeratorForm, BanUserCommunityForm, \
    EscalateReportForm, ResolveReportForm, CreateVideoForm, CreatePollForm, RetrieveRemotePost
 from app.community.util import search_for_community, actor_to_community, \
-    opengraph_parse, url_to_thumbnail_file, save_post, save_icon_file, save_banner_file, send_to_remote_instance, \
+    save_post, save_icon_file, save_banner_file, send_to_remote_instance, \
    delete_post_from_community, delete_post_reply_from_community, community_in_list
 from app.constants import SUBSCRIPTION_MEMBER, SUBSCRIPTION_OWNER, POST_TYPE_LINK, POST_TYPE_ARTICLE, POST_TYPE_IMAGE, \
    SUBSCRIPTION_PENDING, SUBSCRIPTION_MODERATOR, REPORT_STATE_NEW, REPORT_STATE_ESCALATED, REPORT_STATE_RESOLVED, \
--- a/app/community/util.py
+++ b/app/community/util.py
@@ -19,7 +19,7 @@ from app.models import Community, File, BannedInstances, PostReply, PostVote, Po
    Instance, Notification, User, ActivityPubLog, NotificationSubscription, Language, Tag, PollChoice, Poll
 from app.utils import get_request, gibberish, markdown_to_html, domain_from_url, allowlist_html, \
    is_image_url, ensure_directory_exists, inbox_domain, post_ranking, shorten_string, parse_page, \
-    remove_tracking_from_link, ap_datetime, instance_banned, blocked_phrases
+    remove_tracking_from_link, ap_datetime, instance_banned, blocked_phrases, url_to_thumbnail_file, opengraph_parse
 from sqlalchemy import func, desc, text
 import os

@@ -242,52 +242,6 @@ def actor_to_community(actor) -> Community:
    return community


-def opengraph_parse(url):
-    if '?' in url:
-        url = url.split('?')
-        url = url[0]
-    try:
-        return parse_page(url)
-    except Exception as ex:
-        return None
-
-
-def url_to_thumbnail_file(filename) -> File:
-    response = requests.get(filename, timeout=5)
-    if response.status_code == 200:
-        content_type = response.headers.get('content-type')
-        if content_type and content_type.startswith('image'):
-            # Generate file extension from mime type
-            content_type_parts = content_type.split('/')
-            if content_type_parts:
-                file_extension = '.' + content_type_parts[-1]
-                if file_extension == '.jpeg':
-                    file_extension = '.jpg'
-            else:
-                file_extension = os.path.splitext(filename)[1]
-                file_extension = file_extension.replace('%3f', '?')  # sometimes urls are not decoded properly
-                if '?' in file_extension:
-                    file_extension = file_extension.split('?')[0]
-
-            new_filename = gibberish(15)
-            directory = 'app/static/media/posts/' + new_filename[0:2] + '/' + new_filename[2:4]
-            ensure_directory_exists(directory)
-            final_place = os.path.join(directory, new_filename + file_extension)
-            with open(final_place, 'wb') as f:
-                f.write(response.content)
-            response.close()
-            Image.MAX_IMAGE_PIXELS = 89478485
-            with Image.open(final_place) as img:
-                img = ImageOps.exif_transpose(img)
-                img.thumbnail((150, 150))
-                img.save(final_place)
-                thumbnail_width = img.width
-                thumbnail_height = img.height
-            return File(file_name=new_filename + file_extension, thumbnail_width=thumbnail_width,
-                        thumbnail_height=thumbnail_height, thumbnail_path=final_place,
-                        source_url=filename)
-
-
 def save_post(form, post: Post, type: str):
    post.indexable = current_user.indexable
    post.sticky = form.sticky.data
@@ -318,6 +272,7 @@ def save_post(form, post: Post, type: str):
                post.image_id = None

            if post.url.endswith('.mp4') or post.url.endswith('.webm'):
+                post.type = POST_TYPE_VIDEO
                file = File(source_url=form.link_url.data)  # make_image_sizes() will take care of turning this into a still image
                post.image = file
                db.session.add(file)
@@ -331,15 +286,16 @@ def save_post(form, post: Post, type: str):
                    post.type = POST_TYPE_IMAGE
                else:
                    # check opengraph tags on the page and make a thumbnail if an image is available in the og:image meta tag
-                    opengraph = opengraph_parse(form.link_url.data)
-                    if opengraph and (opengraph.get('og:image', '') != '' or opengraph.get('og:image:url', '') != ''):
-                        filename = opengraph.get('og:image') or opengraph.get('og:image:url')
-                        if not filename.startswith('/'):
-                            file = url_to_thumbnail_file(filename)
-                            if file:
-                                file.alt_text = shorten_string(opengraph.get('og:title'), 295)
-                                post.image = file
-                                db.session.add(file)
+                    if not post.type == POST_TYPE_VIDEO:
+                        opengraph = opengraph_parse(form.link_url.data)
+                        if opengraph and (opengraph.get('og:image', '') != '' or opengraph.get('og:image:url', '') != ''):
+                            filename = opengraph.get('og:image') or opengraph.get('og:image:url')
+                            if not filename.startswith('/'):
+                                file = url_to_thumbnail_file(filename)
+                                if file:
+                                    file.alt_text = shorten_string(opengraph.get('og:title'), 295)
+                                    post.image = file
+                                    db.session.add(file)

    elif type == 'image':
        post.title = form.image_title.data
--- a/app/utils.py
+++ b/app/utils.py
@@ -31,11 +31,12 @@ from wtforms.widgets import Select, html_params, ListWidget, CheckboxInput
 from app import db, cache
 import re
 from moviepy.editor import VideoFileClip
-from PIL import Image
+from PIL import Image, ImageOps

 from app.email import send_welcome_email
 from app.models import Settings, Domain, Instance, BannedInstances, User, Community, DomainBlock, ActivityPubLog, IpBan, \
-    Site, Post, PostReply, utcnow, Filter, CommunityMember, InstanceBlock, CommunityBan, Topic, UserBlock, Language
+    Site, Post, PostReply, utcnow, Filter, CommunityMember, InstanceBlock, CommunityBan, Topic, UserBlock, Language, \
+    File


 # Flask's render_template function, with support for themes added
@@ -89,7 +90,8 @@ def get_request(uri, params=None, headers=None) -> requests.Response:
    else:
        payload_str = urllib.parse.urlencode(params) if params else None
    try:
-        response = requests.get(uri, params=payload_str, headers=headers, timeout=5, allow_redirects=True)
+        timeout = 15 if 'washingtonpost.com' in uri else 5  # Washington Post is really slow on og:image for some reason
+        response = requests.get(uri, params=payload_str, headers=headers, timeout=timeout, allow_redirects=True)
    except requests.exceptions.SSLError as invalid_cert:
        # Not our problem if the other end doesn't have proper SSL
        current_app.logger.info(f"{uri} {invalid_cert}")
@@ -851,6 +853,56 @@ def confidence(ups, downs) -> float:
        return _confidence(ups, downs)


+def opengraph_parse(url):
+    if '?' in url:
+        url = url.split('?')
+        url = url[0]
+    try:
+        return parse_page(url)
+    except Exception as ex:
+        return None
+
+
+def url_to_thumbnail_file(filename) -> File:
+    try:
+        timeout = 15 if 'washingtonpost.com' in filename else 5 # Washington Post is really slow for some reason
+        response = requests.get(filename, timeout=timeout)
+    except:
+        return None
+    if response.status_code == 200:
+        content_type = response.headers.get('content-type')
+        if content_type and content_type.startswith('image'):
+            # Generate file extension from mime type
+            content_type_parts = content_type.split('/')
+            if content_type_parts:
+                file_extension = '.' + content_type_parts[-1]
+                if file_extension == '.jpeg':
+                    file_extension = '.jpg'
+            else:
+                file_extension = os.path.splitext(filename)[1]
+                file_extension = file_extension.replace('%3f', '?')  # sometimes urls are not decoded properly
+                if '?' in file_extension:
+                    file_extension = file_extension.split('?')[0]
+
+            new_filename = gibberish(15)
+            directory = 'app/static/media/posts/' + new_filename[0:2] + '/' + new_filename[2:4]
+            ensure_directory_exists(directory)
+            final_place = os.path.join(directory, new_filename + file_extension)
+            with open(final_place, 'wb') as f:
+                f.write(response.content)
+            response.close()
+            Image.MAX_IMAGE_PIXELS = 89478485
+            with Image.open(final_place) as img:
+                img = ImageOps.exif_transpose(img)
+                img.thumbnail((150, 150))
+                img.save(final_place)
+                thumbnail_width = img.width
+                thumbnail_height = img.height
+            return File(file_name=new_filename + file_extension, thumbnail_width=thumbnail_width,
+                        thumbnail_height=thumbnail_height, thumbnail_path=final_place,
+                        source_url=filename)
+
+
 # By no means is this a complete list, but it is very easy to search for the ones you need later.
 KNOWN_OPENGRAPH_TAGS = [
    "og:site_name",
@@ -980,7 +1032,7 @@ def in_sorted_list(arr, target):
 # Makes a still image from a video url, without downloading the whole video file
 def generate_image_from_video_url(video_url, output_path, length=2):

-    response = requests.get(video_url, stream=True)
+    response = requests.get(video_url, stream=True, headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:127.0) Gecko/20100101 Firefox/127.0'})  # Imgur requires a user agent
    content_type = response.headers.get('Content-Type')
    if content_type:
        if 'video/mp4' in content_type: