From d69dac7631430d834b4070687a666d7842365721 Mon Sep 17 00:00:00 2001
From: rimu <3310831+rimu@users.noreply.github.com>
Date: Sun, 4 Feb 2024 22:02:32 +1300
Subject: [PATCH] set user-agent while parsing og:image meta tag

---
Reviewer notes (everything between the three-dash line and the diff is
ignored by git am):
- parse_page: the HTTP status is compared with != instead of "is not",
  which tested object identity rather than value.
- parse_page: a fallback element that is not present on the page is now
  skipped instead of raising AttributeError on None.
- Line breaks and indentation were restored from a whitespace-collapsed
  copy of this patch; confirm the indentation of context lines against the
  tree before applying. The post-image blob hash on the app/utils.py index
  line predates the two parse_page changes above.

 app/community/util.py | 10 ++++-----
 app/main/routes.py    |  5 ++++-
 app/utils.py          | 53 +++++++++++++++++++++++++++++++++++++++++++
 requirements.txt      |  1 -
 4 files changed, 62 insertions(+), 7 deletions(-)

diff --git a/app/community/util.py b/app/community/util.py
index 57122abe..9ba37184 100644
--- a/app/community/util.py
+++ b/app/community/util.py
@@ -15,10 +15,9 @@ from app.constants import POST_TYPE_ARTICLE, POST_TYPE_LINK, POST_TYPE_IMAGE
 from app.models import Community, File, BannedInstances, PostReply, PostVote, Post, utcnow, CommunityMember, Site, \
     Instance, Notification, User
 from app.utils import get_request, gibberish, markdown_to_html, domain_from_url, allowlist_html, \
-    html_to_markdown, is_image_url, ensure_directory_exists, inbox_domain, post_ranking, shorten_string
+    html_to_markdown, is_image_url, ensure_directory_exists, inbox_domain, post_ranking, shorten_string, parse_page
 from sqlalchemy import desc, text
 import os
-from opengraph_parse import parse_page
 
 
 allowed_extensions = ['.gif', '.jpg', '.jpeg', '.png', '.webp', '.heic']
@@ -130,7 +129,6 @@ def actor_to_community(actor) -> Community:
     return community
 
 
-@cache.memoize(timeout=50)
 def opengraph_parse(url):
     if '?' in url:
         url = url.split('?')
@@ -199,8 +197,10 @@ def save_post(form, post: Post):
             else:
                 # check opengraph tags on the page and make a thumbnail if an image is available in the og:image meta tag
                 opengraph = opengraph_parse(form.link_url.data)
-                if opengraph and opengraph.get('og:image', '') != '':
-                    filename = opengraph.get('og:image')
+                if opengraph and (opengraph.get('og:image', '') != '' or opengraph.get('og:image:url', '') != ''):
+                    filename = opengraph.get('og:image') or opengraph.get('og:image:url')
+                    if '?' in filename:
+                        filename = filename.split('?')[0]
                     unused, file_extension = os.path.splitext(filename)
                     if file_extension.lower() in allowed_extensions:
                         file = url_to_thumbnail_file(filename)
diff --git a/app/main/routes.py b/app/main/routes.py
index 8f2e8517..6a2d82d2 100644
--- a/app/main/routes.py
+++ b/app/main/routes.py
@@ -18,7 +18,7 @@ from sqlalchemy import select, desc, text
 from sqlalchemy_searchable import search
 from app.utils import render_template, get_setting, gibberish, request_etag_matches, return_304, blocked_domains, \
     ap_datetime, ip_address, retrieve_block_list, shorten_string, markdown_to_text, user_filters_home, \
-    joined_communities, moderating_communities
+    joined_communities, moderating_communities, parse_page
 from app.models import Community, CommunityMember, Post, Site, User, utcnow, Domain, Topic
 from PIL import Image
 import pytesseract
@@ -249,6 +249,9 @@ def keyboard_shortcuts():
 
 @bp.route('/test')
 def test():
+    x = parse_page('https://slate.com/technology/2024/02/quora-what-happened-ai-decline.html')
+    return str(x)
+
     return current_app.config['SERVER_NAME']
 
     #ip = request.headers.get('X-Forwarded-For') or request.remote_addr
diff --git a/app/utils.py b/app/utils.py
index eb008b15..71228e10 100644
--- a/app/utils.py
+++ b/app/utils.py
@@ -622,3 +622,56 @@ def confidence(ups, downs) -> float:
         return 0.0
     else:
         return _confidence(ups, downs)
+
+
+# By no means is this a complete list, but it is very easy to search for the ones you need later.
+KNOWN_OPENGRAPH_TAGS = [
+    "og:site_name",
+    "og:title",
+    "og:locale",
+    "og:type",
+    "og:image",
+    "og:url",
+    "og:image:url",
+    "og:image:secure_url",
+    "og:image:type",
+    "og:image:width",
+    "og:image:height",
+    "og:image:alt",
+    ]
+
+
+def parse_page(page_url, tags_to_search = KNOWN_OPENGRAPH_TAGS, fallback_tags = None):
+    '''
+    Parses a page, returns a JSON style dictionary of all OG tags found on that page.
+
+    Passing in tags_to_search is optional. By default it will search through KNOWN_OPENGRAPH_TAGS constant, but for the sake of efficiency, you may want to only search for 1 or 2 tags
+
+    fallback_tags is an optional dictionary mapping an OG tag name to the name of an HTML element whose text is used when that OG meta tag is not on the page
+
+    Returns False if page is unreadable
+    '''
+    # read the html from the page
+    response = get_request(page_url)
+
+    # compare by value: 'is not' tests object identity, which is not a reliable way to compare integers
+    if response.status_code != 200:
+        return False
+
+    # set up beautiful soup
+    soup = BeautifulSoup(response.content, 'html.parser')
+
+    # loop through the known list of opengraph tags, searching for each and appending a dictionary as we go.
+    found_tags = {}
+
+    for og_tag in tags_to_search:
+        new_found_tag = soup.find("meta", property=og_tag)
+        if new_found_tag is not None:
+            found_tags[new_found_tag["property"]] = new_found_tag["content"]
+        elif fallback_tags is not None and og_tag in fallback_tags:
+            # the fallback element may be missing too, in which case the tag is simply left out
+            fallback_element = soup.find(fallback_tags[og_tag])
+            if fallback_element is not None:
+                found_tags[og_tag] = fallback_element.text
+
+    return found_tags
diff --git a/requirements.txt b/requirements.txt
index ca9ea92e..143e0df5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -24,7 +24,6 @@ beautifulsoup4==4.12.2
 flask-caching==2.0.2
 Pillow
 pillow-heif
-opengraph-parse==0.0.6
 feedgen==0.9.0
 celery==5.3.6
 redis==5.0.1