From abd4dd16c9dad194e447d018ec14a3dc271007a6 Mon Sep 17 00:00:00 2001
From: rimu <3310831+rimu@users.noreply.github.com>
Date: Tue, 16 Apr 2024 16:35:12 +1200
Subject: [PATCH] embed video and generate still thumbnail

---
 app/activitypub/util.py                 | 165 +++++++++++++++---------
 app/community/util.py                   |  42 +++---
 app/main/routes.py                      |   3 +-
 app/static/structure.css                |   5 +
 app/static/structure.scss               |   5 +
 app/templates/post/_post_full.html      |   9 ++
 app/utils.py                            |  43 +++++-
 docs/project_management/contributing.md |   5 +-
 requirements.txt                        |   1 +
 9 files changed, 193 insertions(+), 85 deletions(-)

diff --git a/app/activitypub/util.py b/app/activitypub/util.py
index 1a463a79..277641bb 100644
--- a/app/activitypub/util.py
+++ b/app/activitypub/util.py
@@ -27,7 +27,7 @@ import pytesseract
 from app.utils import get_request, allowlist_html, get_setting, ap_datetime, markdown_to_html, \
     is_image_url, domain_from_url, gibberish, ensure_directory_exists, markdown_to_text, head_request, post_ranking, \
     shorten_string, reply_already_exists, reply_is_just_link_to_gif_reaction, confidence, remove_tracking_from_link, \
-    blocked_phrases, microblog_content_to_title
+    blocked_phrases, microblog_content_to_title, generate_image_from_video_url
 
 
 def public_key():
@@ -738,78 +738,117 @@ def make_image_sizes(file_id, thumbnail_width=50, medium_width=120, directory='p
 def make_image_sizes_async(file_id, thumbnail_width, medium_width, directory):
     file = File.query.get(file_id)
     if file and file.source_url:
-        try:
-            source_image_response = get_request(file.source_url)
-        except:
-            pass
+        # Videos: no image can be downloaded from source_url, so a still frame of the video is stored instead
+        if file.source_url.endswith('.mp4') or file.source_url.endswith('.webm'):
+            new_filename = gibberish(15)
+
+            # set up the storage directory, nested two levels deep using the first four characters of the filename
+            directory = f'app/static/media/{directory}/' + new_filename[0:2] + '/' + new_filename[2:4]
+            ensure_directory_exists(directory)
+
+            # file paths to store the still image (.jpg) and its thumbnail (.webp) on disk
+            final_place = os.path.join(directory, new_filename + '.jpg')
+            final_place_thumbnail = os.path.join(directory, new_filename + '_thumbnail.webp')
+
+            generate_image_from_video_url(file.source_url, final_place)    # saves a still frame to final_place
+
+            image = Image.open(final_place)    # load the still into Pillow so it can be resized
+            img_width = image.width
+
+            # Resize the image to medium. The resized image overwrites the still saved above
+            if medium_width:
+                if img_width > medium_width:
+                    image.thumbnail((medium_width, medium_width))
+                image.save(final_place)
+                file.file_path = final_place
+                file.width = image.width
+                file.height = image.height
+
+            # Resize the image to a thumbnail (webp). Note img_width is the width from before the medium resize
+            if thumbnail_width:
+                if img_width > thumbnail_width:
+                    image.thumbnail((thumbnail_width, thumbnail_width))
+                image.save(final_place_thumbnail, format="WebP", quality=93)
+                file.thumbnail_path = final_place_thumbnail
+                file.thumbnail_width = image.width
+                file.thumbnail_height = image.height
+
+            db.session.commit()    # persist the paths and dimensions set on file above
+
+        # Images
         else:
-            if source_image_response.status_code == 200:
-                content_type = source_image_response.headers.get('content-type')
-                if content_type and content_type.startswith('image'):
-                    source_image = source_image_response.content
-                    source_image_response.close()
+            try:
+                source_image_response = get_request(file.source_url)
+            except:
+                pass
+            else:
+                if source_image_response.status_code == 200:
+                    content_type = source_image_response.headers.get('content-type')
+                    if content_type and content_type.startswith('image'):
+                        source_image = source_image_response.content
+                        source_image_response.close()
 
-                    file_ext = os.path.splitext(file.source_url)[1]
-                    # fall back to parsing the http content type if the url does not contain a file extension
-                    if file_ext == '':
-                        content_type_parts = content_type.split('/')
-                        if content_type_parts:
-                            file_ext = '.' + content_type_parts[-1]
-                    else:
-                        if '?' in file_ext:
-                            file_ext = file_ext.split('?')[0]
+                        file_ext = os.path.splitext(file.source_url)[1]
+                        # fall back to parsing the http content type if the url does not contain a file extension
+                        if file_ext == '':
+                            content_type_parts = content_type.split('/')
+                            if content_type_parts:
+                                file_ext = '.' + content_type_parts[-1]
+                        else:
+                            if '?' in file_ext:
+                                file_ext = file_ext.split('?')[0]
 
-                    new_filename = gibberish(15)
+                        new_filename = gibberish(15)
 
-                    # set up the storage directory
-                    directory = f'app/static/media/{directory}/' + new_filename[0:2] + '/' + new_filename[2:4]
-                    ensure_directory_exists(directory)
+                        # set up the storage directory
+                        directory = f'app/static/media/{directory}/' + new_filename[0:2] + '/' + new_filename[2:4]
+                        ensure_directory_exists(directory)
 
-                    # file path and names to store the resized images on disk
-                    final_place = os.path.join(directory, new_filename + file_ext)
-                    final_place_thumbnail = os.path.join(directory, new_filename + '_thumbnail.webp')
+                        # file path and names to store the resized images on disk
+                        final_place = os.path.join(directory, new_filename + file_ext)
+                        final_place_thumbnail = os.path.join(directory, new_filename + '_thumbnail.webp')
 
-                    # Load image data into Pillow
-                    Image.MAX_IMAGE_PIXELS = 89478485
-                    image = Image.open(BytesIO(source_image))
-                    image = ImageOps.exif_transpose(image)
-                    img_width = image.width
-                    img_height = image.height
+                        # Load image data into Pillow
+                        Image.MAX_IMAGE_PIXELS = 89478485
+                        image = Image.open(BytesIO(source_image))
+                        image = ImageOps.exif_transpose(image)
+                        img_width = image.width
+                        img_height = image.height
 
-                    # Resize the image to medium
-                    if medium_width:
-                        if img_width > medium_width:
-                            image.thumbnail((medium_width, medium_width))
-                        image.save(final_place)
-                        file.file_path = final_place
-                        file.width = image.width
-                        file.height = image.height
+                        # Resize the image to medium
+                        if medium_width:
+                            if img_width > medium_width:
+                                image.thumbnail((medium_width, medium_width))
+                            image.save(final_place)
+                            file.file_path = final_place
+                            file.width = image.width
+                            file.height = image.height
 
-                    # Resize the image to a thumbnail (webp)
-                    if thumbnail_width:
-                        if img_width > thumbnail_width:
-                            image.thumbnail((thumbnail_width, thumbnail_width))
-                        image.save(final_place_thumbnail, format="WebP", quality=93)
-                        file.thumbnail_path = final_place_thumbnail
-                        file.thumbnail_width = image.width
-                        file.thumbnail_height = image.height
+                        # Resize the image to a thumbnail (webp)
+                        if thumbnail_width:
+                            if img_width > thumbnail_width:
+                                image.thumbnail((thumbnail_width, thumbnail_width))
+                            image.save(final_place_thumbnail, format="WebP", quality=93)
+                            file.thumbnail_path = final_place_thumbnail
+                            file.thumbnail_width = image.width
+                            file.thumbnail_height = image.height
 
-                    db.session.commit()
+                        db.session.commit()
 
-                    # Alert regarding fascist meme content
-                    if img_width < 2000:    # images > 2000px tend to be real photos instead of 4chan screenshots.
-                        try:
-                            image_text = pytesseract.image_to_string(Image.open(BytesIO(source_image)).convert('L'), timeout=30)
-                        except FileNotFoundError as e:
-                            image_text = ''
-                        if 'Anonymous' in image_text and ('No.' in image_text or ' N0' in image_text):   # chan posts usually contain the text 'Anonymous' and ' No.12345'
-                            post = Post.query.filter_by(image_id=file.id).first()
-                            notification = Notification(title='Review this',
-                                                        user_id=1,
-                                                        author_id=post.user_id,
-                                                        url=url_for('activitypub.post_ap', post_id=post.id))
-                            db.session.add(notification)
-                            db.session.commit()
+                        # Alert regarding fascist meme content
+                        if img_width < 2000:    # images > 2000px tend to be real photos instead of 4chan screenshots.
+                            try:
+                                image_text = pytesseract.image_to_string(Image.open(BytesIO(source_image)).convert('L'), timeout=30)
+                            except FileNotFoundError as e:
+                                image_text = ''
+                            if 'Anonymous' in image_text and ('No.' in image_text or ' N0' in image_text):   # chan posts usually contain the text 'Anonymous' and ' No.12345'
+                                post = Post.query.filter_by(image_id=file.id).first()
+                                notification = Notification(title='Review this',
+                                                            user_id=1,
+                                                            author_id=post.user_id,
+                                                            url=url_for('activitypub.post_ap', post_id=post.id))
+                                db.session.add(notification)
+                                db.session.commit()
 
 
 # create a summary from markdown if present, otherwise use html if available
diff --git a/app/community/util.py b/app/community/util.py
index c58dcad9..7b438594 100644
--- a/app/community/util.py
+++ b/app/community/util.py
@@ -112,7 +112,8 @@ def retrieve_mods_and_backfill(community_id: int):
                                 post.ranking = post_ranking(post.score, post.posted_at)
                                 if post.url:
                                     other_posts = Post.query.filter(Post.id != post.id, Post.url == post.url,
-                                            Post.posted_at > post.posted_at - timedelta(days=3), Post.posted_at < post.posted_at + timedelta(days=3)).all()
+                                                                    Post.posted_at > post.posted_at - timedelta(days=3),
+                                                                    Post.posted_at < post.posted_at + timedelta(days=3)).all()
                                     for op in other_posts:
                                         if op.cross_posts is None:
                                             op.cross_posts = [post.id]
@@ -223,26 +224,31 @@ def save_post(form, post: Post, type: str):
                 remove_old_file(post.image_id)
                 post.image_id = None
 
-            unused, file_extension = os.path.splitext(form.link_url.data)
-            # this url is a link to an image - turn it into a image post
-            if file_extension.lower() in allowed_extensions:
-                file = File(source_url=form.link_url.data)
+            if post.url.endswith('.mp4') or post.url.endswith('.webm'):
+                file = File(source_url=form.link_url.data)  # make_image_sizes() will take care of turning this into a still image
                 post.image = file
                 db.session.add(file)
-                post.type = POST_TYPE_IMAGE
             else:
-                # check opengraph tags on the page and make a thumbnail if an image is available in the og:image meta tag
-                opengraph = opengraph_parse(form.link_url.data)
-                if opengraph and (opengraph.get('og:image', '') != '' or opengraph.get('og:image:url', '') != ''):
-                    filename = opengraph.get('og:image') or opengraph.get('og:image:url')
-                    filename_for_extension = filename.split('?')[0] if '?' in filename else filename
-                    unused, file_extension = os.path.splitext(filename_for_extension)
-                    if file_extension.lower() in allowed_extensions and not filename.startswith('/'):
-                        file = url_to_thumbnail_file(filename)
-                        if file:
-                            file.alt_text = shorten_string(opengraph.get('og:title'), 295)
-                            post.image = file
-                            db.session.add(file)
+                unused, file_extension = os.path.splitext(form.link_url.data)
+                # this url is a link to an image - turn it into a image post
+                if file_extension.lower() in allowed_extensions:
+                    file = File(source_url=form.link_url.data)
+                    post.image = file
+                    db.session.add(file)
+                    post.type = POST_TYPE_IMAGE
+                else:
+                    # check opengraph tags on the page and make a thumbnail if an image is available in the og:image meta tag
+                    opengraph = opengraph_parse(form.link_url.data)
+                    if opengraph and (opengraph.get('og:image', '') != '' or opengraph.get('og:image:url', '') != ''):
+                        filename = opengraph.get('og:image') or opengraph.get('og:image:url')
+                        filename_for_extension = filename.split('?')[0] if '?' in filename else filename
+                        unused, file_extension = os.path.splitext(filename_for_extension)
+                        if file_extension.lower() in allowed_extensions and not filename.startswith('/'):
+                            file = url_to_thumbnail_file(filename)
+                            if file:
+                                file.alt_text = shorten_string(opengraph.get('og:title'), 295)
+                                post.image = file
+                                db.session.add(file)
 
     elif type == 'image':
         post.title = form.image_title.data
diff --git a/app/main/routes.py b/app/main/routes.py
index d90f55d3..74bf30d7 100644
--- a/app/main/routes.py
+++ b/app/main/routes.py
@@ -25,7 +25,8 @@ from sqlalchemy_searchable import search
 from app.utils import render_template, get_setting, gibberish, request_etag_matches, return_304, blocked_domains, \
     ap_datetime, ip_address, retrieve_block_list, shorten_string, markdown_to_text, user_filters_home, \
     joined_communities, moderating_communities, parse_page, theme_list, get_request, markdown_to_html, allowlist_html, \
-    blocked_instances, communities_banned_from, topic_tree, recently_upvoted_posts, recently_downvoted_posts
+    blocked_instances, communities_banned_from, topic_tree, recently_upvoted_posts, recently_downvoted_posts, \
+    generate_image_from_video_url
 from app.models import Community, CommunityMember, Post, Site, User, utcnow, Domain, Topic, File, Instance, \
     InstanceRole, Notification
 from PIL import Image
diff --git a/app/static/structure.css b/app/static/structure.css
index 64cbe713..b66f05f9 100644
--- a/app/static/structure.css
+++ b/app/static/structure.css
@@ -1384,4 +1384,9 @@ h1 .warning_badge {
   max-width: 100%;
 }
 
+.responsive-video {
+  max-width: 100%;
+  max-height: 90vh;
+}
+
 /*# sourceMappingURL=structure.css.map */
diff --git a/app/static/structure.scss b/app/static/structure.scss
index 8152ec92..79019d4f 100644
--- a/app/static/structure.scss
+++ b/app/static/structure.scss
@@ -1057,4 +1057,9 @@ h1 .warning_badge {
     line-height: initial;
     max-width: 100%;
   }
+}
+
+.responsive-video {
+  max-width: 100%;
+  max-height: 90vh;
 }
\ No newline at end of file
diff --git a/app/templates/post/_post_full.html b/app/templates/post/_post_full.html
index b0d1c618..891700f3 100644
--- a/app/templates/post/_post_full.html
+++ b/app/templates/post/_post_full.html
@@ -83,6 +83,15 @@
                 <span class="fe fe-external"></span></a></p>
                 {% if post.url.endswith('.mp3') %}
                     <p><audio controls preload="{{ 'none' if low_bandwidth else 'metadata' }}" src="{{ post.url }}"></audio></p>
+                {% elif post.url.endswith('.mp4') or post.url.endswith('.webm') %}
+                    {# Native player for direct video links. The MIME type goes in `type`; `media` expects a media query. #}
+                    <p><video class="responsive-video" controls preload="{{ 'metadata' if low_bandwidth else 'auto' }}">
+                        {% if post.url.endswith('.mp4') %}
+                            <source src="{{ post.url }}" type="video/mp4" />
+                        {% elif post.url.endswith('.webm') %}
+                            <source src="{{ post.url }}" type="video/webm" />
+                        {% endif %}
+                        </video></p>
                 {% endif %}
                 {% if 'youtube.com' in post.url %}
                     <p><a href="https://piped.video/watch?v={{ post.youtube_embed() }}">{{ _('Watch on piped.video') }} <span class="fe fe-external"></span></a></p>
diff --git a/app/utils.py b/app/utils.py
index f7d427cf..c354b085 100644
--- a/app/utils.py
+++ b/app/utils.py
@@ -4,6 +4,7 @@ import bisect
 import hashlib
 import mimetypes
 import random
+import tempfile
 import urllib
 from collections import defaultdict
 from datetime import datetime, timedelta, date
@@ -14,7 +15,7 @@ import math
 from urllib.parse import urlparse, parse_qs, urlencode
 from functools import wraps
 import flask
-from bs4 import BeautifulSoup, NavigableString, MarkupResemblesLocatorWarning
+from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
 import warnings
 warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)
 import requests
@@ -26,6 +27,8 @@ from wtforms.fields  import SelectField, SelectMultipleField
 from wtforms.widgets import Select, html_params, ListWidget, CheckboxInput
 from app import db, cache
 import re
+from moviepy.editor import VideoFileClip
+from PIL import Image
 
 from app.email import send_welcome_email
 from app.models import Settings, Domain, Instance, BannedInstances, User, Community, DomainBlock, ActivityPubLog, IpBan, \
@@ -881,6 +884,44 @@ def in_sorted_list(arr, target):
     return index < len(arr) and arr[index] == target
 
 
+# Makes a still image from the first frame of a video url, downloading at most `length` megabytes of the video.
+# Raises ValueError if the response is not mp4 or webm. The connection and the temporary file are always cleaned up.
+def generate_image_from_video_url(video_url, output_path, length=2):
+    with requests.get(video_url, stream=True, timeout=10) as response:    # closes the streamed connection on exit
+        content_type = response.headers.get('Content-Type')
+        if not content_type:
+            raise ValueError("Content-Type not found in response headers")
+        if 'video/mp4' in content_type:
+            temp_file_extension = '.mp4'
+        elif 'video/webm' in content_type:
+            temp_file_extension = '.webm'
+        else:
+            raise ValueError("Unsupported video format")
+
+        temp_file_path = os.path.join(tempfile.gettempdir(), gibberish(15) + temp_file_extension)
+        bytes_written = 0
+        try:
+            # Write only the start of the video to a temporary file. Bytes are counted as they are written
+            # because os.path.getsize() may not include data still sitting in the write buffer.
+            with open(temp_file_path, 'wb') as f:
+                for chunk in response.iter_content(chunk_size=4096):
+                    f.write(chunk)
+                    bytes_written += len(chunk)
+                    if bytes_written >= length * 1024 * 1024:
+                        break
+
+            # Generate thumbnail from the first frame of the temporary file
+            clip = VideoFileClip(temp_file_path)
+            try:
+                thumbnail = clip.get_frame(0)
+            finally:
+                clip.close()
+            Image.fromarray(thumbnail).save(output_path)
+        finally:
+            if os.path.exists(temp_file_path):    # remove the temp file even if download or decoding failed
+                os.remove(temp_file_path)
+
+
 @cache.memoize(timeout=600)
 def recently_upvoted_posts(user_id) -> List[int]:
     post_ids = db.session.execute(text('SELECT post_id FROM "post_vote" WHERE user_id = :user_id AND effect > 0 ORDER BY id DESC LIMIT 1000'),
diff --git a/docs/project_management/contributing.md b/docs/project_management/contributing.md
index f09ea3d3..92bfeff1 100644
--- a/docs/project_management/contributing.md
+++ b/docs/project_management/contributing.md
@@ -34,9 +34,10 @@ time of things.
 # Coding Standards / Guidelines
 
 **[PEP 8](https://peps.python.org/pep-0008/)** covers the basics. PyCharm encourages this by default - 
-VS Code coders are encouraged to try the free community edition of PyCharm but it is by no means required. 
+VS Code coders may like to try the free community edition of PyCharm but it is by no means required.
 
-Use PEP 8 conventions for line length, naming, indentation. Use descriptive commit messages.
+Use PEP 8 conventions for naming and indentation. Use descriptive commit messages. Try to limit lines of code
+to a length of roughly 120 characters.
 
 Database model classes are singular. As in "Car", not "Cars".
 
diff --git a/requirements.txt b/requirements.txt
index fad9be58..338bdd49 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -32,3 +32,4 @@ Werkzeug==2.3.3
 pytesseract==0.3.10
 sentry-sdk==1.40.6
 python-slugify==8.0.4
+moviepy==1.0.3