From d69dac7631430d834b4070687a666d7842365721 Mon Sep 17 00:00:00 2001
From: rimu <3310831+rimu@users.noreply.github.com>
Date: Sun, 4 Feb 2024 22:02:32 +1300
Subject: [PATCH] set user-agent while parsing og:image meta tag

---
 app/community/util.py | 10 ++++-----
 app/main/routes.py    |  5 ++++-
 app/utils.py          | 47 +++++++++++++++++++++++++++++++++++++++++++
 requirements.txt      |  1 -
 4 files changed, 56 insertions(+), 7 deletions(-)

diff --git a/app/community/util.py b/app/community/util.py
index 57122abe..9ba37184 100644
--- a/app/community/util.py
+++ b/app/community/util.py
@@ -15,10 +15,9 @@ from app.constants import POST_TYPE_ARTICLE, POST_TYPE_LINK, POST_TYPE_IMAGE
 from app.models import Community, File, BannedInstances, PostReply, PostVote, Post, utcnow, CommunityMember, Site, \
     Instance, Notification, User
 from app.utils import get_request, gibberish, markdown_to_html, domain_from_url, allowlist_html, \
-    html_to_markdown, is_image_url, ensure_directory_exists, inbox_domain, post_ranking, shorten_string
+    html_to_markdown, is_image_url, ensure_directory_exists, inbox_domain, post_ranking, shorten_string, parse_page
 from sqlalchemy import desc, text
 import os
-from opengraph_parse import parse_page
 
 
 allowed_extensions = ['.gif', '.jpg', '.jpeg', '.png', '.webp', '.heic']
@@ -130,7 +129,6 @@ def actor_to_community(actor) -> Community:
     return community
 
 
-@cache.memoize(timeout=50)
 def opengraph_parse(url):
     if '?' in url:
         url = url.split('?')
@@ -199,8 +197,10 @@ def save_post(form, post: Post):
             else:
                 # check opengraph tags on the page and make a thumbnail if an image is available in the og:image meta tag
                 opengraph = opengraph_parse(form.link_url.data)
-                if opengraph and opengraph.get('og:image', '') != '':
-                    filename = opengraph.get('og:image')
+                if opengraph and (opengraph.get('og:image', '') != '' or opengraph.get('og:image:url', '') != ''):
+                    filename = opengraph.get('og:image') or opengraph.get('og:image:url')
+                    if '?' in filename:
+                        filename = filename.split('?')[0]
                     unused, file_extension = os.path.splitext(filename)
                     if file_extension.lower() in allowed_extensions:
                         file = url_to_thumbnail_file(filename)
diff --git a/app/main/routes.py b/app/main/routes.py
index 8f2e8517..6a2d82d2 100644
--- a/app/main/routes.py
+++ b/app/main/routes.py
@@ -18,7 +18,7 @@ from sqlalchemy import select, desc, text
 from sqlalchemy_searchable import search
 from app.utils import render_template, get_setting, gibberish, request_etag_matches, return_304, blocked_domains, \
     ap_datetime, ip_address, retrieve_block_list, shorten_string, markdown_to_text, user_filters_home, \
-    joined_communities, moderating_communities
+    joined_communities, moderating_communities, parse_page
 from app.models import Community, CommunityMember, Post, Site, User, utcnow, Domain, Topic
 from PIL import Image
 import pytesseract
@@ -249,6 +249,9 @@ def keyboard_shortcuts():
 @bp.route('/test')
 def test():
 
+    x = parse_page('https://slate.com/technology/2024/02/quora-what-happened-ai-decline.html')
+    return str(x)
+
     return current_app.config['SERVER_NAME']
 
     #ip = request.headers.get('X-Forwarded-For') or request.remote_addr
diff --git a/app/utils.py b/app/utils.py
index eb008b15..71228e10 100644
--- a/app/utils.py
+++ b/app/utils.py
@@ -622,3 +622,50 @@ def confidence(ups, downs) -> float:
         return 0.0
     else:
         return _confidence(ups, downs)
+
+
+# Default Open Graph properties that parse_page() searches for. Not exhaustive: pass tags_to_search to look for others.
+KNOWN_OPENGRAPH_TAGS = [
+    "og:site_name",
+    "og:title",
+    "og:locale",
+    "og:type",
+    "og:image",
+    "og:url",
+    "og:image:url",
+    "og:image:secure_url",
+    "og:image:type",
+    "og:image:width",
+    "og:image:height",
+    "og:image:alt",
+    ]
+
+
+def parse_page(page_url, tags_to_search = KNOWN_OPENGRAPH_TAGS, fallback_tags = None):
+    '''
+    Fetches page_url with get_request() and returns a dictionary of the Open Graph meta tags found on that page.
+
+    tags_to_search: property names to look for. Defaults to KNOWN_OPENGRAPH_TAGS; pass only 1 or 2 tags for efficiency.
+    fallback_tags: optional dict of og property -> HTML tag name whose text is used when the meta tag is unusable.
+
+    Returns False if the response status code is not 200.
+    '''
+    response = get_request(page_url)
+
+    # compare by value: 'is not 200' tests object identity and emits a SyntaxWarning on Python 3.8+
+    if response.status_code != 200:
+        return False
+
+    soup = BeautifulSoup(response.content, 'html.parser')
+    found_tags = {}
+
+    for og_tag in tags_to_search:
+        new_found_tag = soup.find("meta", property=og_tag)
+        # a <meta> without a content attribute falls through to the fallback instead of raising KeyError
+        if new_found_tag is not None and new_found_tag.get("content") is not None:
+            found_tags[new_found_tag["property"]] = new_found_tag["content"]
+        elif fallback_tags is not None and og_tag in fallback_tags:
+            fallback_element = soup.find(fallback_tags[og_tag])
+            if fallback_element is not None:  # the fallback element may be absent from the page too
+                found_tags[og_tag] = fallback_element.text
+    return found_tags
diff --git a/requirements.txt b/requirements.txt
index ca9ea92e..143e0df5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -24,7 +24,6 @@ beautifulsoup4==4.12.2
 flask-caching==2.0.2
 Pillow
 pillow-heif
-opengraph-parse==0.0.6
 feedgen==0.9.0
 celery==5.3.6
 redis==5.0.1