improve thumbnail generation reliability

This commit is contained in:
rimu 2024-06-22 14:18:26 +08:00
parent 805fd7c5d4
commit 115247f422
4 changed files with 80 additions and 62 deletions

View file

@ -31,7 +31,7 @@ from app.utils import get_request, allowlist_html, get_setting, ap_datetime, mar
shorten_string, reply_already_exists, reply_is_just_link_to_gif_reaction, confidence, remove_tracking_from_link, \
blocked_phrases, microblog_content_to_title, generate_image_from_video_url, is_video_url, reply_is_stupid, \
notification_subscribers, communities_banned_from, lemmy_markdown_to_html, actor_contains_blocked_words, \
html_to_text
html_to_text, opengraph_parse, url_to_thumbnail_file
def public_key():
@ -1734,6 +1734,16 @@ def create_post(activity_log: ActivityPubLog, community: Community, request_json
image = File(source_url=request_json['object']['image']['url'])
db.session.add(image)
post.image = image
if post.image is None and post.type == POST_TYPE_LINK: # This is a link post but the source instance has not provided a thumbnail image
# Let's see if we can do better than the source instance did!
opengraph = opengraph_parse(post.url)
if opengraph and (opengraph.get('og:image', '') != '' or opengraph.get('og:image:url', '') != ''):
filename = opengraph.get('og:image') or opengraph.get('og:image:url')
if not filename.startswith('/'):
file = File(source_url=filename, alt_text=shorten_string(opengraph.get('og:title'), 295))
post.image = file
db.session.add(file)
db.session.add(post)
post.ranking = post_ranking(post.score, post.posted_at)
community.post_count += 1

View file

@ -17,7 +17,7 @@ from app.community.forms import SearchRemoteCommunity, CreateDiscussionForm, Cre
DeleteCommunityForm, AddCommunityForm, EditCommunityForm, AddModeratorForm, BanUserCommunityForm, \
EscalateReportForm, ResolveReportForm, CreateVideoForm, CreatePollForm, RetrieveRemotePost
from app.community.util import search_for_community, actor_to_community, \
opengraph_parse, url_to_thumbnail_file, save_post, save_icon_file, save_banner_file, send_to_remote_instance, \
save_post, save_icon_file, save_banner_file, send_to_remote_instance, \
delete_post_from_community, delete_post_reply_from_community, community_in_list
from app.constants import SUBSCRIPTION_MEMBER, SUBSCRIPTION_OWNER, POST_TYPE_LINK, POST_TYPE_ARTICLE, POST_TYPE_IMAGE, \
SUBSCRIPTION_PENDING, SUBSCRIPTION_MODERATOR, REPORT_STATE_NEW, REPORT_STATE_ESCALATED, REPORT_STATE_RESOLVED, \

View file

@ -19,7 +19,7 @@ from app.models import Community, File, BannedInstances, PostReply, PostVote, Po
Instance, Notification, User, ActivityPubLog, NotificationSubscription, Language, Tag, PollChoice, Poll
from app.utils import get_request, gibberish, markdown_to_html, domain_from_url, allowlist_html, \
is_image_url, ensure_directory_exists, inbox_domain, post_ranking, shorten_string, parse_page, \
remove_tracking_from_link, ap_datetime, instance_banned, blocked_phrases
remove_tracking_from_link, ap_datetime, instance_banned, blocked_phrases, url_to_thumbnail_file, opengraph_parse
from sqlalchemy import func, desc, text
import os
@ -242,52 +242,6 @@ def actor_to_community(actor) -> Community:
return community
def opengraph_parse(url):
    """Return the OpenGraph data ``parse_page`` extracts for ``url``, or None.

    Everything from the first ``?`` onwards is dropped from ``url`` before it
    is handed to ``parse_page``. The lookup is best-effort: a missing URL, or
    any exception raised by ``parse_page``, results in ``None`` rather than
    an error reaching the caller.

    :param url: address of the page to inspect; may be None or empty
    :return: the value returned by ``parse_page`` (callers read keys such as
        ``og:image`` and ``og:title`` from it), or None on failure
    """
    if not url:
        # Nothing to look up; previously a None url raised TypeError here,
        # outside the try block, and escaped to the caller.
        return None
    # Keep only the part of the address before the query string.
    page_url = url.split('?', 1)[0]
    try:
        return parse_page(page_url)
    except Exception:
        # Deliberately broad: a thumbnail is optional, so no failure while
        # fetching or parsing a remote page should abort post creation.
        return None
def url_to_thumbnail_file(filename) -> "File | None":
    """Download the image at ``filename`` and turn it into a stored thumbnail.

    The response body is written under ``app/static/media/posts/`` using a
    random name, then re-opened with Pillow, rotated according to its EXIF
    orientation, scaled to fit within 150x150 and saved over the download.

    This is best-effort: network errors, non-200 responses and downloads that
    Pillow cannot process all yield ``None`` instead of raising, so callers
    only need to check the return value.

    :param filename: URL of the remote image (despite the parameter name)
    :return: an unsaved ``File`` describing the thumbnail, or None on failure
    """
    if not filename:
        return None
    try:
        response = requests.get(filename, timeout=5)
    except Exception:
        # Unreachable host, timeout, malformed URL... a thumbnail is optional,
        # so none of these should break the post being saved.
        return None
    if response.status_code != 200:
        return None

    # Strip parameters such as "; charset=binary" so they cannot end up in the
    # file extension, and compare case-insensitively.
    mime_type = (response.headers.get('content-type') or '').split(';')[0].strip().lower()
    if mime_type.startswith('image'):
        # Generate file extension from mime type
        file_extension = '.' + mime_type.split('/')[-1]
        if file_extension == '.jpeg':
            file_extension = '.jpg'
    else:
        # No usable content type: fall back to the extension in the URL.
        file_extension = os.path.splitext(filename)[1]
        file_extension = file_extension.replace('%3f', '?')  # sometimes urls are not decoded properly
        if '?' in file_extension:
            file_extension = file_extension.split('?')[0]

    new_filename = gibberish(15)
    # Spread files over two levels of sub-directories taken from the random name.
    directory = 'app/static/media/posts/' + new_filename[0:2] + '/' + new_filename[2:4]
    ensure_directory_exists(directory)
    final_place = os.path.join(directory, new_filename + file_extension)
    with open(final_place, 'wb') as f:
        f.write(response.content)
    response.close()

    # Upper bound on the pixel count Pillow will decode (decompression-bomb guard).
    Image.MAX_IMAGE_PIXELS = 89478485
    try:
        with Image.open(final_place) as img:
            img = ImageOps.exif_transpose(img)
            img.thumbnail((150, 150))
            img.save(final_place)
            thumbnail_width = img.width
            thumbnail_height = img.height
    except Exception:
        # Not an image, an unsupported extension, or an oversized image:
        # remove the stray download instead of leaving it on disk.
        try:
            os.remove(final_place)
        except OSError:
            pass  # already gone or not removable; nothing more we can do
        return None

    return File(file_name=new_filename + file_extension, thumbnail_width=thumbnail_width,
                thumbnail_height=thumbnail_height, thumbnail_path=final_place,
                source_url=filename)
def save_post(form, post: Post, type: str):
post.indexable = current_user.indexable
post.sticky = form.sticky.data
@ -318,6 +272,7 @@ def save_post(form, post: Post, type: str):
post.image_id = None
if post.url.endswith('.mp4') or post.url.endswith('.webm'):
post.type = POST_TYPE_VIDEO
file = File(source_url=form.link_url.data) # make_image_sizes() will take care of turning this into a still image
post.image = file
db.session.add(file)
@ -331,15 +286,16 @@ def save_post(form, post: Post, type: str):
post.type = POST_TYPE_IMAGE
else:
# check opengraph tags on the page and make a thumbnail if an image is available in the og:image meta tag
opengraph = opengraph_parse(form.link_url.data)
if opengraph and (opengraph.get('og:image', '') != '' or opengraph.get('og:image:url', '') != ''):
filename = opengraph.get('og:image') or opengraph.get('og:image:url')
if not filename.startswith('/'):
file = url_to_thumbnail_file(filename)
if file:
file.alt_text = shorten_string(opengraph.get('og:title'), 295)
post.image = file
db.session.add(file)
if not post.type == POST_TYPE_VIDEO:
opengraph = opengraph_parse(form.link_url.data)
if opengraph and (opengraph.get('og:image', '') != '' or opengraph.get('og:image:url', '') != ''):
filename = opengraph.get('og:image') or opengraph.get('og:image:url')
if not filename.startswith('/'):
file = url_to_thumbnail_file(filename)
if file:
file.alt_text = shorten_string(opengraph.get('og:title'), 295)
post.image = file
db.session.add(file)
elif type == 'image':
post.title = form.image_title.data

View file

@ -31,11 +31,12 @@ from wtforms.widgets import Select, html_params, ListWidget, CheckboxInput
from app import db, cache
import re
from moviepy.editor import VideoFileClip
from PIL import Image
from PIL import Image, ImageOps
from app.email import send_welcome_email
from app.models import Settings, Domain, Instance, BannedInstances, User, Community, DomainBlock, ActivityPubLog, IpBan, \
Site, Post, PostReply, utcnow, Filter, CommunityMember, InstanceBlock, CommunityBan, Topic, UserBlock, Language
Site, Post, PostReply, utcnow, Filter, CommunityMember, InstanceBlock, CommunityBan, Topic, UserBlock, Language, \
File
# Flask's render_template function, with support for themes added
@ -89,7 +90,8 @@ def get_request(uri, params=None, headers=None) -> requests.Response:
else:
payload_str = urllib.parse.urlencode(params) if params else None
try:
response = requests.get(uri, params=payload_str, headers=headers, timeout=5, allow_redirects=True)
timeout = 15 if 'washingtonpost.com' in uri else 5 # Washington Post is really slow on og:image for some reason
response = requests.get(uri, params=payload_str, headers=headers, timeout=timeout, allow_redirects=True)
except requests.exceptions.SSLError as invalid_cert:
# Not our problem if the other end doesn't have proper SSL
current_app.logger.info(f"{uri} {invalid_cert}")
@ -851,6 +853,56 @@ def confidence(ups, downs) -> float:
return _confidence(ups, downs)
def opengraph_parse(url):
    """Return the OpenGraph data ``parse_page`` extracts for ``url``, or None.

    Everything from the first ``?`` onwards is dropped from ``url`` before it
    is handed to ``parse_page``. The lookup is best-effort: a missing URL, or
    any exception raised by ``parse_page``, results in ``None`` rather than
    an error reaching the caller.

    :param url: address of the page to inspect; may be None or empty
    :return: the value returned by ``parse_page`` (callers read keys such as
        ``og:image`` and ``og:title`` from it), or None on failure
    """
    if not url:
        # Nothing to look up; previously a None url raised TypeError here,
        # outside the try block, and escaped to the caller.
        return None
    # Keep only the part of the address before the query string.
    page_url = url.split('?', 1)[0]
    try:
        return parse_page(page_url)
    except Exception:
        # Deliberately broad: a thumbnail is optional, so no failure while
        # fetching or parsing a remote page should abort post creation.
        return None
def url_to_thumbnail_file(filename) -> "File | None":
    """Download the image at ``filename`` and turn it into a stored thumbnail.

    The response body is written under ``app/static/media/posts/`` using a
    random name, then re-opened with Pillow, rotated according to its EXIF
    orientation, scaled to fit within 150x150 and saved over the download.

    This is best-effort: network errors, non-200 responses and downloads that
    Pillow cannot process all yield ``None`` instead of raising, so callers
    only need to check the return value.

    :param filename: URL of the remote image (despite the parameter name)
    :return: an unsaved ``File`` describing the thumbnail, or None on failure
    """
    if not filename:
        return None
    try:
        timeout = 15 if 'washingtonpost.com' in filename else 5  # Washington Post is really slow for some reason
        response = requests.get(filename, timeout=timeout)
    except Exception:
        # Unreachable host, timeout, malformed URL... a thumbnail is optional.
        # Narrowed from a bare except so Ctrl-C / SystemExit still propagate.
        return None
    if response.status_code != 200:
        return None

    # Strip parameters such as "; charset=binary" so they cannot end up in the
    # file extension, and compare case-insensitively.
    mime_type = (response.headers.get('content-type') or '').split(';')[0].strip().lower()
    if mime_type.startswith('image'):
        # Generate file extension from mime type
        file_extension = '.' + mime_type.split('/')[-1]
        if file_extension == '.jpeg':
            file_extension = '.jpg'
    else:
        # No usable content type: fall back to the extension in the URL.
        file_extension = os.path.splitext(filename)[1]
        file_extension = file_extension.replace('%3f', '?')  # sometimes urls are not decoded properly
        if '?' in file_extension:
            file_extension = file_extension.split('?')[0]

    new_filename = gibberish(15)
    # Spread files over two levels of sub-directories taken from the random name.
    directory = 'app/static/media/posts/' + new_filename[0:2] + '/' + new_filename[2:4]
    ensure_directory_exists(directory)
    final_place = os.path.join(directory, new_filename + file_extension)
    with open(final_place, 'wb') as f:
        f.write(response.content)
    response.close()

    # Upper bound on the pixel count Pillow will decode (decompression-bomb guard).
    Image.MAX_IMAGE_PIXELS = 89478485
    try:
        with Image.open(final_place) as img:
            img = ImageOps.exif_transpose(img)
            img.thumbnail((150, 150))
            img.save(final_place)
            thumbnail_width = img.width
            thumbnail_height = img.height
    except Exception:
        # Not an image, an unsupported extension, or an oversized image:
        # remove the stray download instead of leaving it on disk.
        try:
            os.remove(final_place)
        except OSError:
            pass  # already gone or not removable; nothing more we can do
        return None

    return File(file_name=new_filename + file_extension, thumbnail_width=thumbnail_width,
                thumbnail_height=thumbnail_height, thumbnail_path=final_place,
                source_url=filename)
# By no means is this a complete list, but it is very easy to search for the ones you need later.
KNOWN_OPENGRAPH_TAGS = [
"og:site_name",
@ -980,7 +1032,7 @@ def in_sorted_list(arr, target):
# Makes a still image from a video url, without downloading the whole video file
def generate_image_from_video_url(video_url, output_path, length=2):
response = requests.get(video_url, stream=True)
response = requests.get(video_url, stream=True, headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:127.0) Gecko/20100101 Firefox/127.0'}) # Imgur requires a user agent
content_type = response.headers.get('Content-Type')
if content_type:
if 'video/mp4' in content_type: