Better image handling - use the Content-Type header instead of the file extension (fixes #37)

This commit is contained in:
rimu 2024-06-19 13:46:36 +08:00
parent cce25e7a54
commit e2ce7c832f
2 changed files with 64 additions and 31 deletions

View file

@ -253,10 +253,22 @@ def opengraph_parse(url):
def url_to_thumbnail_file(filename) -> File: def url_to_thumbnail_file(filename) -> File:
filename_for_extension = filename.split('?')[0] if '?' in filename else filename
unused, file_extension = os.path.splitext(filename_for_extension)
response = requests.get(filename, timeout=5) response = requests.get(filename, timeout=5)
if response.status_code == 200: if response.status_code == 200:
content_type = response.headers.get('content-type')
if content_type and content_type.startswith('image'):
# Generate file extension from mime type
content_type_parts = content_type.split('/')
if content_type_parts:
file_extension = '.' + content_type_parts[-1]
if file_extension == '.jpeg':
file_extension = '.jpg'
else:
file_extension = os.path.splitext(filename)[1]
file_extension = file_extension.replace('%3f', '?') # sometimes urls are not decoded properly
if '?' in file_extension:
file_extension = file_extension.split('?')[0]
new_filename = gibberish(15) new_filename = gibberish(15)
directory = 'app/static/media/posts/' + new_filename[0:2] + '/' + new_filename[2:4] directory = 'app/static/media/posts/' + new_filename[0:2] + '/' + new_filename[2:4]
ensure_directory_exists(directory) ensure_directory_exists(directory)
@ -322,9 +334,7 @@ def save_post(form, post: Post, type: str):
opengraph = opengraph_parse(form.link_url.data) opengraph = opengraph_parse(form.link_url.data)
if opengraph and (opengraph.get('og:image', '') != '' or opengraph.get('og:image:url', '') != ''): if opengraph and (opengraph.get('og:image', '') != '' or opengraph.get('og:image:url', '') != ''):
filename = opengraph.get('og:image') or opengraph.get('og:image:url') filename = opengraph.get('og:image') or opengraph.get('og:image:url')
filename_for_extension = filename.split('?')[0] if '?' in filename else filename if not filename.startswith('/'):
unused, file_extension = os.path.splitext(filename_for_extension)
if file_extension.lower() in allowed_extensions and not filename.startswith('/'):
file = url_to_thumbnail_file(filename) file = url_to_thumbnail_file(filename)
if file: if file:
file.alt_text = shorten_string(opengraph.get('og:title'), 295) file.alt_text = shorten_string(opengraph.get('og:title'), 295)
@ -415,9 +425,7 @@ def save_post(form, post: Post, type: str):
opengraph = opengraph_parse(form.video_url.data) opengraph = opengraph_parse(form.video_url.data)
if opengraph and (opengraph.get('og:image', '') != '' or opengraph.get('og:image:url', '') != ''): if opengraph and (opengraph.get('og:image', '') != '' or opengraph.get('og:image:url', '') != ''):
filename = opengraph.get('og:image') or opengraph.get('og:image:url') filename = opengraph.get('og:image') or opengraph.get('og:image:url')
filename_for_extension = filename.split('?')[0] if '?' in filename else filename if not filename.startswith('/'):
unused, file_extension = os.path.splitext(filename_for_extension)
if file_extension.lower() in allowed_extensions and not filename.startswith('/'):
file = url_to_thumbnail_file(filename) file = url_to_thumbnail_file(filename)
if file: if file:
file.alt_text = shorten_string(opengraph.get('og:title'), 295) file.alt_text = shorten_string(opengraph.get('og:title'), 295)

View file

@ -165,19 +165,44 @@ def gibberish(length: int = 10) -> str:
def is_image_url(url):
    """Return True if `url` appears to point at an image.

    The Content-Type reported by a HEAD request (via mime_type_using_head) is
    preferred. If no usable Content-Type is available, the decision falls back
    to the file extension found in the URL path.
    """
    common_image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp']
    mime_type = mime_type_using_head(url)
    if mime_type:
        # Drop parameters such as "; charset=binary" and normalise case so that
        # e.g. "Image/JPEG; charset=binary" is treated the same as "image/jpeg".
        media_type = mime_type.split(';', 1)[0].strip().lower()
        main_type, separator, subtype = media_type.partition('/')
        if separator:
            # Check the main type too, otherwise a non-image type whose subtype
            # happens to match (e.g. "video/jpeg") would be accepted.
            return main_type == 'image' and f'.{subtype}' in common_image_extensions
        # Malformed header without a "/": fall through to the extension check
        # instead of raising IndexError.
    parsed_url = urlparse(url)
    path = parsed_url.path.lower()
    return any(path.endswith(extension) for extension in common_image_extensions)
def is_video_url(url):
    """Return True if `url` appears to point at a video.

    The Content-Type reported by a HEAD request (via mime_type_using_head) is
    preferred. If no usable Content-Type is available, the decision falls back
    to the file extension found in the URL path.
    """
    common_video_extensions = ['.mp4', '.webm']
    mime_type = mime_type_using_head(url)
    if mime_type:
        # Drop parameters such as "; codecs=..." and normalise case.
        media_type = mime_type.split(';', 1)[0].strip().lower()
        main_type, separator, subtype = media_type.partition('/')
        if separator:
            # Check the main type too: "audio/mp4" and "audio/webm" share a
            # subtype with the video formats but are not videos.
            return main_type == 'video' and f'.{subtype}' in common_video_extensions
        # Malformed header without a "/": fall through to the extension check
        # instead of raising IndexError.
    parsed_url = urlparse(url)
    path = parsed_url.path.lower()
    return any(path.endswith(extension) for extension in common_video_extensions)
@cache.memoize(timeout=10)
def mime_type_using_head(url):
    """Return the Content-Type header of `url`, or '' if it cannot be determined.

    Uses a HEAD request - this is the same as GET except only the HTTP headers
    are transferred. Any request failure (connection error, timeout, HTTP error
    status) results in an empty string rather than an exception.
    """
    try:
        # requests.head() does not follow redirects by default, which would report
        # the Content-Type of the redirect response rather than the final resource.
        # A timeout is required so a slow or unresponsive host cannot block forever.
        response = requests.head(url, timeout=5, allow_redirects=True)
        response.raise_for_status()  # Raise an exception for HTTP errors
    except requests.exceptions.RequestException:
        return ''
    return response.headers.get('Content-Type') or ''
# sanitise HTML using an allow list # sanitise HTML using an allow list
def allowlist_html(html: str) -> str: def allowlist_html(html: str) -> str:
if html is None or html == '': if html is None or html == '':