diff --git a/Dockerfile b/Dockerfile index 08a9130c..b10e111b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,7 +4,7 @@ FROM --platform=$BUILDPLATFORM python:3-alpine AS builder RUN apk update RUN apk add pkgconfig -RUN apk add --virtual build-deps gcc python3-dev musl-dev tesseract-ocr tesseract-ocr-data-eng ffmpeg +RUN apk add --virtual build-deps gcc python3-dev musl-dev tesseract-ocr tesseract-ocr-data-eng WORKDIR /app COPY . /app diff --git a/app/activitypub/util.py b/app/activitypub/util.py index c4aa0014..2fefb9ef 100644 --- a/app/activitypub/util.py +++ b/app/activitypub/util.py @@ -10,7 +10,6 @@ import httpx import redis from flask import current_app, request, g, url_for, json from flask_babel import _ -from requests import JSONDecodeError from sqlalchemy import text, func, desc from sqlalchemy.exc import IntegrityError @@ -29,7 +28,7 @@ import pytesseract from app.utils import get_request, allowlist_html, get_setting, ap_datetime, markdown_to_html, \ is_image_url, domain_from_url, gibberish, ensure_directory_exists, head_request, \ shorten_string, remove_tracking_from_link, \ - microblog_content_to_title, generate_image_from_video_url, is_video_url, \ + microblog_content_to_title, is_video_url, \ notification_subscribers, communities_banned_from, actor_contains_blocked_words, \ html_to_text, add_to_modlog_activitypub, joined_communities, \ moderating_communities, get_task_session, is_video_hosting_site, opengraph_parse @@ -1009,148 +1008,106 @@ def make_image_sizes_async(file_id, thumbnail_width, medium_width, directory, to session = get_task_session() file: File = session.query(File).get(file_id) if file and file.source_url: - # Videos (old code. not invoked because file.source_url won't end .mp4 or .webm) - if file.source_url.endswith('.mp4') or file.source_url.endswith('.webm'): - new_filename = gibberish(15) - - # set up the storage directory - directory = f'app/static/media/{directory}/' + new_filename[0:2] + '/' + new_filename[2:4] - ensure_directory_exists(directory) - - # file path and names to store the resized images on disk - final_place = os.path.join(directory, new_filename + '.jpg') - final_place_thumbnail = os.path.join(directory, new_filename + '_thumbnail.webp') - try: - generate_image_from_video_url(file.source_url, final_place) - except Exception as e: - return - - if final_place: - image = Image.open(final_place) - img_width = image.width - - # Resize the image to medium - if medium_width: - if img_width > medium_width: - image.thumbnail((medium_width, medium_width)) - image.save(final_place) - file.file_path = final_place - file.width = image.width - file.height = image.height - - # Resize the image to a thumbnail (webp) - if thumbnail_width: - if img_width > thumbnail_width: - image.thumbnail((thumbnail_width, thumbnail_width)) - image.save(final_place_thumbnail, format="WebP", quality=93) - file.thumbnail_path = final_place_thumbnail - file.thumbnail_width = image.width - file.thumbnail_height = image.height - - session.commit() - - # Images + try: + source_image_response = get_request(file.source_url) + except: + pass else: - try: - source_image_response = get_request(file.source_url) - except: - pass - else: - if source_image_response.status_code == 404 and '/api/v3/image_proxy' in file.source_url: - source_image_response.close() - # Lemmy failed to retrieve the image but we might have better luck. Example source_url: https://slrpnk.net/api/v3/image_proxy?url=https%3A%2F%2Fi.guim.co.uk%2Fimg%2Fmedia%2F24e87cb4d730141848c339b3b862691ca536fb26%2F0_164_3385_2031%2Fmaster%2F3385.jpg%3Fwidth%3D1200%26height%3D630%26quality%3D85%26auto%3Dformat%26fit%3Dcrop%26overlay-align%3Dbottom%252Cleft%26overlay-width%3D100p%26overlay-base64%3DL2ltZy9zdGF0aWMvb3ZlcmxheXMvdGctZGVmYXVsdC5wbmc%26enable%3Dupscale%26s%3D0ec9d25a8cb5db9420471054e26cfa63 - # The un-proxied image url is the query parameter called 'url' - parsed_url = urlparse(file.source_url) - query_params = parse_qs(parsed_url.query) - if 'url' in query_params: - url_value = query_params['url'][0] - source_image_response = get_request(url_value) - else: - source_image_response = None - if source_image_response and source_image_response.status_code == 200: - content_type = source_image_response.headers.get('content-type') - if content_type: - if content_type.startswith('image') or (content_type == 'application/octet-stream' and file.source_url.endswith('.avif')): - source_image = source_image_response.content - source_image_response.close() + if source_image_response.status_code == 404 and '/api/v3/image_proxy' in file.source_url: + source_image_response.close() + # Lemmy failed to retrieve the image but we might have better luck. Example source_url: https://slrpnk.net/api/v3/image_proxy?url=https%3A%2F%2Fi.guim.co.uk%2Fimg%2Fmedia%2F24e87cb4d730141848c339b3b862691ca536fb26%2F0_164_3385_2031%2Fmaster%2F3385.jpg%3Fwidth%3D1200%26height%3D630%26quality%3D85%26auto%3Dformat%26fit%3Dcrop%26overlay-align%3Dbottom%252Cleft%26overlay-width%3D100p%26overlay-base64%3DL2ltZy9zdGF0aWMvb3ZlcmxheXMvdGctZGVmYXVsdC5wbmc%26enable%3Dupscale%26s%3D0ec9d25a8cb5db9420471054e26cfa63 + # The un-proxied image url is the query parameter called 'url' + parsed_url = urlparse(file.source_url) + query_params = parse_qs(parsed_url.query) + if 'url' in query_params: + url_value = query_params['url'][0] + source_image_response = get_request(url_value) + else: + source_image_response = None + if source_image_response and source_image_response.status_code == 200: + content_type = source_image_response.headers.get('content-type') + if content_type: + if content_type.startswith('image') or (content_type == 'application/octet-stream' and file.source_url.endswith('.avif')): + source_image = source_image_response.content + source_image_response.close() - content_type_parts = content_type.split('/') - if content_type_parts: - # content type headers often are just 'image/jpeg' but sometimes 'image/jpeg;charset=utf8' + content_type_parts = content_type.split('/') + if content_type_parts: + # content type headers often are just 'image/jpeg' but sometimes 'image/jpeg;charset=utf8' - # Remove ;charset=whatever - main_part = content_type.split(';')[0] + # Remove ;charset=whatever + main_part = content_type.split(';')[0] - # Split the main part on the '/' character and take the second part - file_ext = '.' + main_part.split('/')[1] - file_ext = file_ext.strip() # just to be sure + # Split the main part on the '/' character and take the second part + file_ext = '.' + main_part.split('/')[1] + file_ext = file_ext.strip() # just to be sure - if file_ext == '.jpeg': - file_ext = '.jpg' - elif file_ext == '.svg+xml': - return # no need to resize SVG images - elif file_ext == '.octet-stream': - file_ext = '.avif' - else: - file_ext = os.path.splitext(file.source_url)[1] - file_ext = file_ext.replace('%3f', '?') # sometimes urls are not decoded properly - if '?' in file_ext: - file_ext = file_ext.split('?')[0] + if file_ext == '.jpeg': + file_ext = '.jpg' + elif file_ext == '.svg+xml': + return # no need to resize SVG images + elif file_ext == '.octet-stream': + file_ext = '.avif' + else: + file_ext = os.path.splitext(file.source_url)[1] + file_ext = file_ext.replace('%3f', '?') # sometimes urls are not decoded properly + if '?' in file_ext: + file_ext = file_ext.split('?')[0] - new_filename = gibberish(15) + new_filename = gibberish(15) - # set up the storage directory - directory = f'app/static/media/{directory}/' + new_filename[0:2] + '/' + new_filename[2:4] - ensure_directory_exists(directory) + # set up the storage directory + directory = f'app/static/media/{directory}/' + new_filename[0:2] + '/' + new_filename[2:4] + ensure_directory_exists(directory) - # file path and names to store the resized images on disk - final_place = os.path.join(directory, new_filename + file_ext) - final_place_thumbnail = os.path.join(directory, new_filename + '_thumbnail.webp') + # file path and names to store the resized images on disk + final_place = os.path.join(directory, new_filename + file_ext) + final_place_thumbnail = os.path.join(directory, new_filename + '_thumbnail.webp') - if file_ext == '.avif': # this is quite a big plugin so we'll only load it if necessary - import pillow_avif + if file_ext == '.avif': # this is quite a big plugin so we'll only load it if necessary + import pillow_avif - # Load image data into Pillow - Image.MAX_IMAGE_PIXELS = 89478485 - image = Image.open(BytesIO(source_image)) - image = ImageOps.exif_transpose(image) - img_width = image.width - img_height = image.height + # Load image data into Pillow + Image.MAX_IMAGE_PIXELS = 89478485 + image = Image.open(BytesIO(source_image)) + image = ImageOps.exif_transpose(image) + img_width = image.width + img_height = image.height - # Resize the image to medium - if medium_width: - if img_width > medium_width: - image.thumbnail((medium_width, medium_width)) - image.save(final_place) - file.file_path = final_place - file.width = image.width - file.height = image.height + # Resize the image to medium + if medium_width: + if img_width > medium_width: + image.thumbnail((medium_width, medium_width)) + image.save(final_place) + file.file_path = final_place + file.width = image.width + file.height = image.height - # Resize the image to a thumbnail (webp) - if thumbnail_width: - if img_width > thumbnail_width: - image.thumbnail((thumbnail_width, thumbnail_width)) - image.save(final_place_thumbnail, format="WebP", quality=93) - file.thumbnail_path = final_place_thumbnail - file.thumbnail_width = image.width - file.thumbnail_height = image.height + # Resize the image to a thumbnail (webp) + if thumbnail_width: + if img_width > thumbnail_width: + image.thumbnail((thumbnail_width, thumbnail_width)) + image.save(final_place_thumbnail, format="WebP", quality=93) + file.thumbnail_path = final_place_thumbnail + file.thumbnail_width = image.width + file.thumbnail_height = image.height - session.commit() + session.commit() - # Alert regarding fascist meme content - if toxic_community and img_width < 2000: # images > 2000px tend to be real photos instead of 4chan screenshots. - try: - image_text = pytesseract.image_to_string(Image.open(BytesIO(source_image)).convert('L'), timeout=30) - except Exception as e: - image_text = '' - if 'Anonymous' in image_text and ('No.' in image_text or ' N0' in image_text): # chan posts usually contain the text 'Anonymous' and ' No.12345' - post = Post.query.filter_by(image_id=file.id).first() - notification = Notification(title='Review this', - user_id=1, - author_id=post.user_id, - url=url_for('activitypub.post_ap', post_id=post.id)) - session.add(notification) - session.commit() + # Alert regarding fascist meme content + if toxic_community and img_width < 2000: # images > 2000px tend to be real photos instead of 4chan screenshots. + try: + image_text = pytesseract.image_to_string(Image.open(BytesIO(source_image)).convert('L'), timeout=30) + except Exception as e: + image_text = '' + if 'Anonymous' in image_text and ('No.' in image_text or ' N0' in image_text): # chan posts usually contain the text 'Anonymous' and ' No.12345' + post = Post.query.filter_by(image_id=file.id).first() + notification = Notification(title='Review this', + user_id=1, + author_id=post.user_id, + url=url_for('activitypub.post_ap', post_id=post.id)) + session.add(notification) + session.commit() def find_reply_parent(in_reply_to: str) -> Tuple[int, int, int]: diff --git a/app/models.py b/app/models.py index ceae4a4a..1c8c3f26 100644 --- a/app/models.py +++ b/app/models.py @@ -1364,7 +1364,7 @@ class Post(db.Model): i += 1 db.session.commit() - if post.image_id: + if post.image_id and not post.type == constants.POST_TYPE_VIDEO: make_image_sizes(post.image_id, 170, 512, 'posts', community.low_quality) # the 512 sized image is for masonry view diff --git a/app/utils.py b/app/utils.py index 14bfdc06..45b50df1 100644 --- a/app/utils.py +++ b/app/utils.py @@ -4,7 +4,6 @@ import bisect import hashlib import mimetypes import random -import tempfile import urllib from collections import defaultdict from datetime import datetime, timedelta, date @@ -13,11 +12,9 @@ from typing import List, Literal, Union import httpx import markdown2 -import math from urllib.parse import urlparse, parse_qs, urlencode from functools import wraps import flask -import requests from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning import warnings import jwt @@ -34,7 +31,6 @@ from wtforms.fields import SelectField, SelectMultipleField from wtforms.widgets import Select, html_params, ListWidget, CheckboxInput from app import db, cache, httpx_client import re -from moviepy.editor import VideoFileClip from PIL import Image, ImageOps from app.models import Settings, Domain, Instance, BannedInstances, User, Community, DomainBlock, ActivityPubLog, IpBan, \ @@ -1109,49 +1105,6 @@ def in_sorted_list(arr, target): return index < len(arr) and arr[index] == target -# Makes a still image from a video url, without downloading the whole video file -def generate_image_from_video_url(video_url, output_path, length=2): - - response = requests.get(video_url, stream=True, timeout=5, - headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:127.0) Gecko/20100101 Firefox/127.0'}) # Imgur requires a user agent - content_type = response.headers.get('Content-Type') - if content_type: - if 'video/mp4' in content_type: - temp_file_extension = '.mp4' - elif 'video/webm' in content_type: - temp_file_extension = '.webm' - else: - raise ValueError("Unsupported video format") - else: - raise ValueError("Content-Type not found in response headers") - - # Generate a random temporary file name - temp_file_name = gibberish(15) + temp_file_extension - temp_file_path = os.path.join(tempfile.gettempdir(), temp_file_name) - - # Write the downloaded data to a temporary file - with open(temp_file_path, 'wb') as f: - for chunk in response.iter_content(chunk_size=4096): - f.write(chunk) - if os.path.getsize(temp_file_path) >= length * 1024 * 1024: - break - - # Generate thumbnail from the temporary file - try: - clip = VideoFileClip(temp_file_path) - except Exception as e: - os.unlink(temp_file_path) - raise e - thumbnail = clip.get_frame(0) - clip.close() - - # Save the image - thumbnail_image = Image.fromarray(thumbnail) - thumbnail_image.save(output_path) - - os.remove(temp_file_path) - - @cache.memoize(timeout=600) def recently_upvoted_posts(user_id) -> List[int]: post_ids = db.session.execute(text('SELECT post_id FROM "post_vote" WHERE user_id = :user_id AND effect > 0 ORDER BY id DESC LIMIT 1000'), diff --git a/requirements.txt b/requirements.txt index 3f0cbefd..d2a5bcca 100644 --- a/requirements.txt +++ b/requirements.txt @@ -32,4 +32,3 @@ Werkzeug==2.3.3 pytesseract==0.3.10 sentry-sdk==1.40.6 python-slugify==8.0.4 -moviepy==1.0.3