Remove moviepy and ffmpeg dependencies

2025-01-23 11:26:56 -08:00 · 2024-12-16 20:49:43 +13:00 · 2024-12-16 20:49:43 +13:00 · d73c12d4c7
commit d73c12d4c7
parent 40777cd390
5 changed files with 89 additions and 180 deletions
--- a/2
+++ b/2
@@ -4,7 +4,7 @@ FROM --platform=$BUILDPLATFORM python:3-alpine AS builder

 RUN apk update
 RUN apk add pkgconfig
-RUN apk add --virtual build-deps gcc python3-dev musl-dev tesseract-ocr tesseract-ocr-data-eng ffmpeg
+RUN apk add --virtual build-deps gcc python3-dev musl-dev tesseract-ocr tesseract-ocr-data-eng

 WORKDIR /app
 COPY . /app
--- a/app/activitypub/util.py
+++ b/app/activitypub/util.py
@@ -10,7 +10,6 @@ import httpx
 import redis
 from flask import current_app, request, g, url_for, json
 from flask_babel import _
-from requests import JSONDecodeError
 from sqlalchemy import text, func, desc
 from sqlalchemy.exc import IntegrityError

@@ -29,7 +28,7 @@ import pytesseract
 from app.utils import get_request, allowlist_html, get_setting, ap_datetime, markdown_to_html, \
    is_image_url, domain_from_url, gibberish, ensure_directory_exists, head_request, \
    shorten_string, remove_tracking_from_link, \
-    microblog_content_to_title, generate_image_from_video_url, is_video_url, \
+    microblog_content_to_title, is_video_url, \
    notification_subscribers, communities_banned_from, actor_contains_blocked_words, \
    html_to_text, add_to_modlog_activitypub, joined_communities, \
    moderating_communities, get_task_session, is_video_hosting_site, opengraph_parse
@@ -1009,148 +1008,106 @@ def make_image_sizes_async(file_id, thumbnail_width, medium_width, directory, to
    session = get_task_session()
    file: File = session.query(File).get(file_id)
    if file and file.source_url:
-        # Videos (old code. not invoked because file.source_url won't end .mp4 or .webm)
-        if file.source_url.endswith('.mp4') or file.source_url.endswith('.webm'):
-            new_filename = gibberish(15)
-
-            # set up the storage directory
-            directory = f'app/static/media/{directory}/' + new_filename[0:2] + '/' + new_filename[2:4]
-            ensure_directory_exists(directory)
-
-            # file path and names to store the resized images on disk
-            final_place = os.path.join(directory, new_filename + '.jpg')
-            final_place_thumbnail = os.path.join(directory, new_filename + '_thumbnail.webp')
-            try:
-                generate_image_from_video_url(file.source_url, final_place)
-            except Exception as e:
-                return
-
-            if final_place:
-                image = Image.open(final_place)
-                img_width = image.width
-
-                # Resize the image to medium
-                if medium_width:
-                    if img_width > medium_width:
-                        image.thumbnail((medium_width, medium_width))
-                    image.save(final_place)
-                    file.file_path = final_place
-                    file.width = image.width
-                    file.height = image.height
-
-                # Resize the image to a thumbnail (webp)
-                if thumbnail_width:
-                    if img_width > thumbnail_width:
-                        image.thumbnail((thumbnail_width, thumbnail_width))
-                    image.save(final_place_thumbnail, format="WebP", quality=93)
-                    file.thumbnail_path = final_place_thumbnail
-                    file.thumbnail_width = image.width
-                    file.thumbnail_height = image.height
-
-                session.commit()
-
-        # Images
+        try:
+            source_image_response = get_request(file.source_url)
+        except:
+            pass
        else:
-            try:
-                source_image_response = get_request(file.source_url)
-            except:
-                pass
-            else:
-                if source_image_response.status_code == 404 and '/api/v3/image_proxy' in file.source_url:
-                    source_image_response.close()
-                    # Lemmy failed to retrieve the image but we might have better luck. Example source_url: https://slrpnk.net/api/v3/image_proxy?url=https%3A%2F%2Fi.guim.co.uk%2Fimg%2Fmedia%2F24e87cb4d730141848c339b3b862691ca536fb26%2F0_164_3385_2031%2Fmaster%2F3385.jpg%3Fwidth%3D1200%26height%3D630%26quality%3D85%26auto%3Dformat%26fit%3Dcrop%26overlay-align%3Dbottom%252Cleft%26overlay-width%3D100p%26overlay-base64%3DL2ltZy9zdGF0aWMvb3ZlcmxheXMvdGctZGVmYXVsdC5wbmc%26enable%3Dupscale%26s%3D0ec9d25a8cb5db9420471054e26cfa63
-                    # The un-proxied image url is the query parameter called 'url'
-                    parsed_url = urlparse(file.source_url)
-                    query_params = parse_qs(parsed_url.query)
-                    if 'url' in query_params:
-                        url_value = query_params['url'][0]
-                        source_image_response = get_request(url_value)
-                    else:
-                        source_image_response = None
-                if source_image_response and source_image_response.status_code == 200:
-                    content_type = source_image_response.headers.get('content-type')
-                    if content_type:
-                        if content_type.startswith('image') or (content_type == 'application/octet-stream' and file.source_url.endswith('.avif')):
-                            source_image = source_image_response.content
-                            source_image_response.close()
+            if source_image_response.status_code == 404 and '/api/v3/image_proxy' in file.source_url:
+                source_image_response.close()
+                # Lemmy failed to retrieve the image but we might have better luck. Example source_url: https://slrpnk.net/api/v3/image_proxy?url=https%3A%2F%2Fi.guim.co.uk%2Fimg%2Fmedia%2F24e87cb4d730141848c339b3b862691ca536fb26%2F0_164_3385_2031%2Fmaster%2F3385.jpg%3Fwidth%3D1200%26height%3D630%26quality%3D85%26auto%3Dformat%26fit%3Dcrop%26overlay-align%3Dbottom%252Cleft%26overlay-width%3D100p%26overlay-base64%3DL2ltZy9zdGF0aWMvb3ZlcmxheXMvdGctZGVmYXVsdC5wbmc%26enable%3Dupscale%26s%3D0ec9d25a8cb5db9420471054e26cfa63
+                # The un-proxied image url is the query parameter called 'url'
+                parsed_url = urlparse(file.source_url)
+                query_params = parse_qs(parsed_url.query)
+                if 'url' in query_params:
+                    url_value = query_params['url'][0]
+                    source_image_response = get_request(url_value)
+                else:
+                    source_image_response = None
+            if source_image_response and source_image_response.status_code == 200:
+                content_type = source_image_response.headers.get('content-type')
+                if content_type:
+                    if content_type.startswith('image') or (content_type == 'application/octet-stream' and file.source_url.endswith('.avif')):
+                        source_image = source_image_response.content
+                        source_image_response.close()

-                            content_type_parts = content_type.split('/')
-                            if content_type_parts:
-                                # content type headers often are just 'image/jpeg' but sometimes 'image/jpeg;charset=utf8'
+                        content_type_parts = content_type.split('/')
+                        if content_type_parts:
+                            # content type headers often are just 'image/jpeg' but sometimes 'image/jpeg;charset=utf8'

-                                # Remove ;charset=whatever
-                                main_part = content_type.split(';')[0]
+                            # Remove ;charset=whatever
+                            main_part = content_type.split(';')[0]

-                                # Split the main part on the '/' character and take the second part
-                                file_ext = '.' + main_part.split('/')[1]
-                                file_ext = file_ext.strip() # just to be sure
+                            # Split the main part on the '/' character and take the second part
+                            file_ext = '.' + main_part.split('/')[1]
+                            file_ext = file_ext.strip() # just to be sure

-                                if file_ext == '.jpeg':
-                                    file_ext = '.jpg'
-                                elif file_ext == '.svg+xml':
-                                    return  # no need to resize SVG images
-                                elif file_ext == '.octet-stream':
-                                    file_ext = '.avif'
-                            else:
-                                file_ext = os.path.splitext(file.source_url)[1]
-                                file_ext = file_ext.replace('%3f', '?')  # sometimes urls are not decoded properly
-                                if '?' in file_ext:
-                                    file_ext = file_ext.split('?')[0]
+                            if file_ext == '.jpeg':
+                                file_ext = '.jpg'
+                            elif file_ext == '.svg+xml':
+                                return  # no need to resize SVG images
+                            elif file_ext == '.octet-stream':
+                                file_ext = '.avif'
+                        else:
+                            file_ext = os.path.splitext(file.source_url)[1]
+                            file_ext = file_ext.replace('%3f', '?')  # sometimes urls are not decoded properly
+                            if '?' in file_ext:
+                                file_ext = file_ext.split('?')[0]

-                            new_filename = gibberish(15)
+                        new_filename = gibberish(15)

-                            # set up the storage directory
-                            directory = f'app/static/media/{directory}/' + new_filename[0:2] + '/' + new_filename[2:4]
-                            ensure_directory_exists(directory)
+                        # set up the storage directory
+                        directory = f'app/static/media/{directory}/' + new_filename[0:2] + '/' + new_filename[2:4]
+                        ensure_directory_exists(directory)

-                            # file path and names to store the resized images on disk
-                            final_place = os.path.join(directory, new_filename + file_ext)
-                            final_place_thumbnail = os.path.join(directory, new_filename + '_thumbnail.webp')
+                        # file path and names to store the resized images on disk
+                        final_place = os.path.join(directory, new_filename + file_ext)
+                        final_place_thumbnail = os.path.join(directory, new_filename + '_thumbnail.webp')

-                            if file_ext == '.avif': # this is quite a big plugin so we'll only load it if necessary
-                                import pillow_avif
+                        if file_ext == '.avif': # this is quite a big plugin so we'll only load it if necessary
+                            import pillow_avif

-                            # Load image data into Pillow
-                            Image.MAX_IMAGE_PIXELS = 89478485
-                            image = Image.open(BytesIO(source_image))
-                            image = ImageOps.exif_transpose(image)
-                            img_width = image.width
-                            img_height = image.height
+                        # Load image data into Pillow
+                        Image.MAX_IMAGE_PIXELS = 89478485
+                        image = Image.open(BytesIO(source_image))
+                        image = ImageOps.exif_transpose(image)
+                        img_width = image.width
+                        img_height = image.height

-                            # Resize the image to medium
-                            if medium_width:
-                                if img_width > medium_width:
-                                    image.thumbnail((medium_width, medium_width))
-                                image.save(final_place)
-                                file.file_path = final_place
-                                file.width = image.width
-                                file.height = image.height
+                        # Resize the image to medium
+                        if medium_width:
+                            if img_width > medium_width:
+                                image.thumbnail((medium_width, medium_width))
+                            image.save(final_place)
+                            file.file_path = final_place
+                            file.width = image.width
+                            file.height = image.height

-                            # Resize the image to a thumbnail (webp)
-                            if thumbnail_width:
-                                if img_width > thumbnail_width:
-                                    image.thumbnail((thumbnail_width, thumbnail_width))
-                                image.save(final_place_thumbnail, format="WebP", quality=93)
-                                file.thumbnail_path = final_place_thumbnail
-                                file.thumbnail_width = image.width
-                                file.thumbnail_height = image.height
+                        # Resize the image to a thumbnail (webp)
+                        if thumbnail_width:
+                            if img_width > thumbnail_width:
+                                image.thumbnail((thumbnail_width, thumbnail_width))
+                            image.save(final_place_thumbnail, format="WebP", quality=93)
+                            file.thumbnail_path = final_place_thumbnail
+                            file.thumbnail_width = image.width
+                            file.thumbnail_height = image.height

-                            session.commit()
+                        session.commit()

-                            # Alert regarding fascist meme content
-                            if toxic_community and img_width < 2000:    # images > 2000px tend to be real photos instead of 4chan screenshots.
-                                try:
-                                    image_text = pytesseract.image_to_string(Image.open(BytesIO(source_image)).convert('L'), timeout=30)
-                                except Exception as e:
-                                    image_text = ''
-                                if 'Anonymous' in image_text and ('No.' in image_text or ' N0' in image_text):   # chan posts usually contain the text 'Anonymous' and ' No.12345'
-                                    post = Post.query.filter_by(image_id=file.id).first()
-                                    notification = Notification(title='Review this',
-                                                                user_id=1,
-                                                                author_id=post.user_id,
-                                                                url=url_for('activitypub.post_ap', post_id=post.id))
-                                    session.add(notification)
-                                    session.commit()
+                        # Alert regarding fascist meme content
+                        if toxic_community and img_width < 2000:    # images > 2000px tend to be real photos instead of 4chan screenshots.
+                            try:
+                                image_text = pytesseract.image_to_string(Image.open(BytesIO(source_image)).convert('L'), timeout=30)
+                            except Exception as e:
+                                image_text = ''
+                            if 'Anonymous' in image_text and ('No.' in image_text or ' N0' in image_text):   # chan posts usually contain the text 'Anonymous' and ' No.12345'
+                                post = Post.query.filter_by(image_id=file.id).first()
+                                notification = Notification(title='Review this',
+                                                            user_id=1,
+                                                            author_id=post.user_id,
+                                                            url=url_for('activitypub.post_ap', post_id=post.id))
+                                session.add(notification)
+                                session.commit()


 def find_reply_parent(in_reply_to: str) -> Tuple[int, int, int]:
--- a/app/models.py
+++ b/app/models.py
@@ -1364,7 +1364,7 @@ class Post(db.Model):
                    i += 1
                db.session.commit()

-            if post.image_id:
+            if post.image_id and not post.type == constants.POST_TYPE_VIDEO:
                make_image_sizes(post.image_id, 170, 512, 'posts',
                                 community.low_quality)  # the 512 sized image is for masonry view

--- a/app/utils.py
+++ b/app/utils.py
@@ -4,7 +4,6 @@ import bisect
 import hashlib
 import mimetypes
 import random
-import tempfile
 import urllib
 from collections import defaultdict
 from datetime import datetime, timedelta, date
@@ -13,11 +12,9 @@ from typing import List, Literal, Union

 import httpx
 import markdown2
-import math
 from urllib.parse import urlparse, parse_qs, urlencode
 from functools import wraps
 import flask
-import requests
 from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
 import warnings
 import jwt
@@ -34,7 +31,6 @@ from wtforms.fields  import SelectField, SelectMultipleField
 from wtforms.widgets import Select, html_params, ListWidget, CheckboxInput
 from app import db, cache, httpx_client
 import re
-from moviepy.editor import VideoFileClip
 from PIL import Image, ImageOps

 from app.models import Settings, Domain, Instance, BannedInstances, User, Community, DomainBlock, ActivityPubLog, IpBan, \
@@ -1109,49 +1105,6 @@ def in_sorted_list(arr, target):
    return index < len(arr) and arr[index] == target


-# Makes a still image from a video url, without downloading the whole video file
-def generate_image_from_video_url(video_url, output_path, length=2):
-
-    response = requests.get(video_url, stream=True, timeout=5,
-                            headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:127.0) Gecko/20100101 Firefox/127.0'})  # Imgur requires a user agent
-    content_type = response.headers.get('Content-Type')
-    if content_type:
-        if 'video/mp4' in content_type:
-            temp_file_extension = '.mp4'
-        elif 'video/webm' in content_type:
-            temp_file_extension = '.webm'
-        else:
-            raise ValueError("Unsupported video format")
-    else:
-        raise ValueError("Content-Type not found in response headers")
-
-    # Generate a random temporary file name
-    temp_file_name = gibberish(15) + temp_file_extension
-    temp_file_path = os.path.join(tempfile.gettempdir(), temp_file_name)
-
-    # Write the downloaded data to a temporary file
-    with open(temp_file_path, 'wb') as f:
-        for chunk in response.iter_content(chunk_size=4096):
-            f.write(chunk)
-            if os.path.getsize(temp_file_path) >= length * 1024 * 1024:
-                break
-
-    # Generate thumbnail from the temporary file
-    try:
-        clip = VideoFileClip(temp_file_path)
-    except Exception as e:
-        os.unlink(temp_file_path)
-        raise e
-    thumbnail = clip.get_frame(0)
-    clip.close()
-
-    # Save the image
-    thumbnail_image = Image.fromarray(thumbnail)
-    thumbnail_image.save(output_path)
-
-    os.remove(temp_file_path)
-
-
@cache.memoize(timeout=600)
 def recently_upvoted_posts(user_id) -> List[int]:
    post_ids = db.session.execute(text('SELECT post_id FROM "post_vote" WHERE user_id = :user_id AND effect > 0 ORDER BY id DESC LIMIT 1000'),
--- a/requirements.txt
+++ b/requirements.txt
@@ -32,4 +32,3 @@ Werkzeug==2.3.3
 pytesseract==0.3.10
 sentry-sdk==1.40.6
 python-slugify==8.0.4
-moviepy==1.0.3