remove moviepy and ffmpeg dependency

This commit is contained in:
rimu 2024-12-16 20:49:43 +13:00
parent 40777cd390
commit d73c12d4c7
5 changed files with 89 additions and 180 deletions

View file

@ -4,7 +4,7 @@ FROM --platform=$BUILDPLATFORM python:3-alpine AS builder
RUN apk update
RUN apk add pkgconfig
RUN apk add --virtual build-deps gcc python3-dev musl-dev tesseract-ocr tesseract-ocr-data-eng ffmpeg
RUN apk add --virtual build-deps gcc python3-dev musl-dev tesseract-ocr tesseract-ocr-data-eng
WORKDIR /app
COPY . /app

View file

@ -10,7 +10,6 @@ import httpx
import redis
from flask import current_app, request, g, url_for, json
from flask_babel import _
from requests import JSONDecodeError
from sqlalchemy import text, func, desc
from sqlalchemy.exc import IntegrityError
@ -29,7 +28,7 @@ import pytesseract
from app.utils import get_request, allowlist_html, get_setting, ap_datetime, markdown_to_html, \
is_image_url, domain_from_url, gibberish, ensure_directory_exists, head_request, \
shorten_string, remove_tracking_from_link, \
microblog_content_to_title, generate_image_from_video_url, is_video_url, \
microblog_content_to_title, is_video_url, \
notification_subscribers, communities_banned_from, actor_contains_blocked_words, \
html_to_text, add_to_modlog_activitypub, joined_communities, \
moderating_communities, get_task_session, is_video_hosting_site, opengraph_parse
@ -1009,148 +1008,106 @@ def make_image_sizes_async(file_id, thumbnail_width, medium_width, directory, to
session = get_task_session()
file: File = session.query(File).get(file_id)
if file and file.source_url:
# Videos (old code. not invoked because file.source_url won't end .mp4 or .webm)
if file.source_url.endswith('.mp4') or file.source_url.endswith('.webm'):
new_filename = gibberish(15)
# set up the storage directory
directory = f'app/static/media/{directory}/' + new_filename[0:2] + '/' + new_filename[2:4]
ensure_directory_exists(directory)
# file path and names to store the resized images on disk
final_place = os.path.join(directory, new_filename + '.jpg')
final_place_thumbnail = os.path.join(directory, new_filename + '_thumbnail.webp')
try:
generate_image_from_video_url(file.source_url, final_place)
except Exception as e:
return
if final_place:
image = Image.open(final_place)
img_width = image.width
# Resize the image to medium
if medium_width:
if img_width > medium_width:
image.thumbnail((medium_width, medium_width))
image.save(final_place)
file.file_path = final_place
file.width = image.width
file.height = image.height
# Resize the image to a thumbnail (webp)
if thumbnail_width:
if img_width > thumbnail_width:
image.thumbnail((thumbnail_width, thumbnail_width))
image.save(final_place_thumbnail, format="WebP", quality=93)
file.thumbnail_path = final_place_thumbnail
file.thumbnail_width = image.width
file.thumbnail_height = image.height
session.commit()
# Images
try:
source_image_response = get_request(file.source_url)
except:
pass
else:
try:
source_image_response = get_request(file.source_url)
except:
pass
else:
if source_image_response.status_code == 404 and '/api/v3/image_proxy' in file.source_url:
source_image_response.close()
# Lemmy failed to retrieve the image but we might have better luck. Example source_url: https://slrpnk.net/api/v3/image_proxy?url=https%3A%2F%2Fi.guim.co.uk%2Fimg%2Fmedia%2F24e87cb4d730141848c339b3b862691ca536fb26%2F0_164_3385_2031%2Fmaster%2F3385.jpg%3Fwidth%3D1200%26height%3D630%26quality%3D85%26auto%3Dformat%26fit%3Dcrop%26overlay-align%3Dbottom%252Cleft%26overlay-width%3D100p%26overlay-base64%3DL2ltZy9zdGF0aWMvb3ZlcmxheXMvdGctZGVmYXVsdC5wbmc%26enable%3Dupscale%26s%3D0ec9d25a8cb5db9420471054e26cfa63
# The un-proxied image url is the query parameter called 'url'
parsed_url = urlparse(file.source_url)
query_params = parse_qs(parsed_url.query)
if 'url' in query_params:
url_value = query_params['url'][0]
source_image_response = get_request(url_value)
else:
source_image_response = None
if source_image_response and source_image_response.status_code == 200:
content_type = source_image_response.headers.get('content-type')
if content_type:
if content_type.startswith('image') or (content_type == 'application/octet-stream' and file.source_url.endswith('.avif')):
source_image = source_image_response.content
source_image_response.close()
if source_image_response.status_code == 404 and '/api/v3/image_proxy' in file.source_url:
source_image_response.close()
# Lemmy failed to retrieve the image but we might have better luck. Example source_url: https://slrpnk.net/api/v3/image_proxy?url=https%3A%2F%2Fi.guim.co.uk%2Fimg%2Fmedia%2F24e87cb4d730141848c339b3b862691ca536fb26%2F0_164_3385_2031%2Fmaster%2F3385.jpg%3Fwidth%3D1200%26height%3D630%26quality%3D85%26auto%3Dformat%26fit%3Dcrop%26overlay-align%3Dbottom%252Cleft%26overlay-width%3D100p%26overlay-base64%3DL2ltZy9zdGF0aWMvb3ZlcmxheXMvdGctZGVmYXVsdC5wbmc%26enable%3Dupscale%26s%3D0ec9d25a8cb5db9420471054e26cfa63
# The un-proxied image url is the query parameter called 'url'
parsed_url = urlparse(file.source_url)
query_params = parse_qs(parsed_url.query)
if 'url' in query_params:
url_value = query_params['url'][0]
source_image_response = get_request(url_value)
else:
source_image_response = None
if source_image_response and source_image_response.status_code == 200:
content_type = source_image_response.headers.get('content-type')
if content_type:
if content_type.startswith('image') or (content_type == 'application/octet-stream' and file.source_url.endswith('.avif')):
source_image = source_image_response.content
source_image_response.close()
content_type_parts = content_type.split('/')
if content_type_parts:
# content type headers often are just 'image/jpeg' but sometimes 'image/jpeg;charset=utf8'
content_type_parts = content_type.split('/')
if content_type_parts:
# content type headers often are just 'image/jpeg' but sometimes 'image/jpeg;charset=utf8'
# Remove ;charset=whatever
main_part = content_type.split(';')[0]
# Remove ;charset=whatever
main_part = content_type.split(';')[0]
# Split the main part on the '/' character and take the second part
file_ext = '.' + main_part.split('/')[1]
file_ext = file_ext.strip() # just to be sure
# Split the main part on the '/' character and take the second part
file_ext = '.' + main_part.split('/')[1]
file_ext = file_ext.strip() # just to be sure
if file_ext == '.jpeg':
file_ext = '.jpg'
elif file_ext == '.svg+xml':
return # no need to resize SVG images
elif file_ext == '.octet-stream':
file_ext = '.avif'
else:
file_ext = os.path.splitext(file.source_url)[1]
file_ext = file_ext.replace('%3f', '?') # sometimes urls are not decoded properly
if '?' in file_ext:
file_ext = file_ext.split('?')[0]
if file_ext == '.jpeg':
file_ext = '.jpg'
elif file_ext == '.svg+xml':
return # no need to resize SVG images
elif file_ext == '.octet-stream':
file_ext = '.avif'
else:
file_ext = os.path.splitext(file.source_url)[1]
file_ext = file_ext.replace('%3f', '?') # sometimes urls are not decoded properly
if '?' in file_ext:
file_ext = file_ext.split('?')[0]
new_filename = gibberish(15)
new_filename = gibberish(15)
# set up the storage directory
directory = f'app/static/media/{directory}/' + new_filename[0:2] + '/' + new_filename[2:4]
ensure_directory_exists(directory)
# set up the storage directory
directory = f'app/static/media/{directory}/' + new_filename[0:2] + '/' + new_filename[2:4]
ensure_directory_exists(directory)
# file path and names to store the resized images on disk
final_place = os.path.join(directory, new_filename + file_ext)
final_place_thumbnail = os.path.join(directory, new_filename + '_thumbnail.webp')
# file path and names to store the resized images on disk
final_place = os.path.join(directory, new_filename + file_ext)
final_place_thumbnail = os.path.join(directory, new_filename + '_thumbnail.webp')
if file_ext == '.avif': # this is quite a big plugin so we'll only load it if necessary
import pillow_avif
if file_ext == '.avif': # this is quite a big plugin so we'll only load it if necessary
import pillow_avif
# Load image data into Pillow
Image.MAX_IMAGE_PIXELS = 89478485
image = Image.open(BytesIO(source_image))
image = ImageOps.exif_transpose(image)
img_width = image.width
img_height = image.height
# Load image data into Pillow
Image.MAX_IMAGE_PIXELS = 89478485
image = Image.open(BytesIO(source_image))
image = ImageOps.exif_transpose(image)
img_width = image.width
img_height = image.height
# Resize the image to medium
if medium_width:
if img_width > medium_width:
image.thumbnail((medium_width, medium_width))
image.save(final_place)
file.file_path = final_place
file.width = image.width
file.height = image.height
# Resize the image to medium
if medium_width:
if img_width > medium_width:
image.thumbnail((medium_width, medium_width))
image.save(final_place)
file.file_path = final_place
file.width = image.width
file.height = image.height
# Resize the image to a thumbnail (webp)
if thumbnail_width:
if img_width > thumbnail_width:
image.thumbnail((thumbnail_width, thumbnail_width))
image.save(final_place_thumbnail, format="WebP", quality=93)
file.thumbnail_path = final_place_thumbnail
file.thumbnail_width = image.width
file.thumbnail_height = image.height
# Resize the image to a thumbnail (webp)
if thumbnail_width:
if img_width > thumbnail_width:
image.thumbnail((thumbnail_width, thumbnail_width))
image.save(final_place_thumbnail, format="WebP", quality=93)
file.thumbnail_path = final_place_thumbnail
file.thumbnail_width = image.width
file.thumbnail_height = image.height
session.commit()
session.commit()
# Alert regarding fascist meme content
if toxic_community and img_width < 2000: # images > 2000px tend to be real photos instead of 4chan screenshots.
try:
image_text = pytesseract.image_to_string(Image.open(BytesIO(source_image)).convert('L'), timeout=30)
except Exception as e:
image_text = ''
if 'Anonymous' in image_text and ('No.' in image_text or ' N0' in image_text): # chan posts usually contain the text 'Anonymous' and ' No.12345'
post = Post.query.filter_by(image_id=file.id).first()
notification = Notification(title='Review this',
user_id=1,
author_id=post.user_id,
url=url_for('activitypub.post_ap', post_id=post.id))
session.add(notification)
session.commit()
# Alert regarding fascist meme content
if toxic_community and img_width < 2000: # images > 2000px tend to be real photos instead of 4chan screenshots.
try:
image_text = pytesseract.image_to_string(Image.open(BytesIO(source_image)).convert('L'), timeout=30)
except Exception as e:
image_text = ''
if 'Anonymous' in image_text and ('No.' in image_text or ' N0' in image_text): # chan posts usually contain the text 'Anonymous' and ' No.12345'
post = Post.query.filter_by(image_id=file.id).first()
notification = Notification(title='Review this',
user_id=1,
author_id=post.user_id,
url=url_for('activitypub.post_ap', post_id=post.id))
session.add(notification)
session.commit()
def find_reply_parent(in_reply_to: str) -> Tuple[int, int, int]:

View file

@ -1364,7 +1364,7 @@ class Post(db.Model):
i += 1
db.session.commit()
if post.image_id:
if post.image_id and not post.type == constants.POST_TYPE_VIDEO:
make_image_sizes(post.image_id, 170, 512, 'posts',
community.low_quality) # the 512 sized image is for masonry view

View file

@ -4,7 +4,6 @@ import bisect
import hashlib
import mimetypes
import random
import tempfile
import urllib
from collections import defaultdict
from datetime import datetime, timedelta, date
@ -13,11 +12,9 @@ from typing import List, Literal, Union
import httpx
import markdown2
import math
from urllib.parse import urlparse, parse_qs, urlencode
from functools import wraps
import flask
import requests
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
import warnings
import jwt
@ -34,7 +31,6 @@ from wtforms.fields import SelectField, SelectMultipleField
from wtforms.widgets import Select, html_params, ListWidget, CheckboxInput
from app import db, cache, httpx_client
import re
from moviepy.editor import VideoFileClip
from PIL import Image, ImageOps
from app.models import Settings, Domain, Instance, BannedInstances, User, Community, DomainBlock, ActivityPubLog, IpBan, \
@ -1109,49 +1105,6 @@ def in_sorted_list(arr, target):
return index < len(arr) and arr[index] == target
# Makes a still image from a video url, without downloading the whole video file
def generate_image_from_video_url(video_url, output_path, length=2):
response = requests.get(video_url, stream=True, timeout=5,
headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:127.0) Gecko/20100101 Firefox/127.0'}) # Imgur requires a user agent
content_type = response.headers.get('Content-Type')
if content_type:
if 'video/mp4' in content_type:
temp_file_extension = '.mp4'
elif 'video/webm' in content_type:
temp_file_extension = '.webm'
else:
raise ValueError("Unsupported video format")
else:
raise ValueError("Content-Type not found in response headers")
# Generate a random temporary file name
temp_file_name = gibberish(15) + temp_file_extension
temp_file_path = os.path.join(tempfile.gettempdir(), temp_file_name)
# Write the downloaded data to a temporary file
with open(temp_file_path, 'wb') as f:
for chunk in response.iter_content(chunk_size=4096):
f.write(chunk)
if os.path.getsize(temp_file_path) >= length * 1024 * 1024:
break
# Generate thumbnail from the temporary file
try:
clip = VideoFileClip(temp_file_path)
except Exception as e:
os.unlink(temp_file_path)
raise e
thumbnail = clip.get_frame(0)
clip.close()
# Save the image
thumbnail_image = Image.fromarray(thumbnail)
thumbnail_image.save(output_path)
os.remove(temp_file_path)
@cache.memoize(timeout=600)
def recently_upvoted_posts(user_id) -> List[int]:
post_ids = db.session.execute(text('SELECT post_id FROM "post_vote" WHERE user_id = :user_id AND effect > 0 ORDER BY id DESC LIMIT 1000'),

View file

@ -32,4 +32,3 @@ Werkzeug==2.3.3
pytesseract==0.3.10
sentry-sdk==1.40.6
python-slugify==8.0.4
moviepy==1.0.3