set user-agent while parsing og:image meta tag

This commit is contained in:
rimu 2024-02-04 22:02:32 +13:00
parent cb2762e15e
commit d69dac7631
4 changed files with 56 additions and 7 deletions

View file

@ -15,10 +15,9 @@ from app.constants import POST_TYPE_ARTICLE, POST_TYPE_LINK, POST_TYPE_IMAGE
from app.models import Community, File, BannedInstances, PostReply, PostVote, Post, utcnow, CommunityMember, Site, \ from app.models import Community, File, BannedInstances, PostReply, PostVote, Post, utcnow, CommunityMember, Site, \
Instance, Notification, User Instance, Notification, User
from app.utils import get_request, gibberish, markdown_to_html, domain_from_url, allowlist_html, \ from app.utils import get_request, gibberish, markdown_to_html, domain_from_url, allowlist_html, \
html_to_markdown, is_image_url, ensure_directory_exists, inbox_domain, post_ranking, shorten_string html_to_markdown, is_image_url, ensure_directory_exists, inbox_domain, post_ranking, shorten_string, parse_page
from sqlalchemy import desc, text from sqlalchemy import desc, text
import os import os
from opengraph_parse import parse_page
allowed_extensions = ['.gif', '.jpg', '.jpeg', '.png', '.webp', '.heic'] allowed_extensions = ['.gif', '.jpg', '.jpeg', '.png', '.webp', '.heic']
@ -130,7 +129,6 @@ def actor_to_community(actor) -> Community:
return community return community
@cache.memoize(timeout=50)
def opengraph_parse(url): def opengraph_parse(url):
if '?' in url: if '?' in url:
url = url.split('?') url = url.split('?')
@ -199,8 +197,10 @@ def save_post(form, post: Post):
else: else:
# check opengraph tags on the page and make a thumbnail if an image is available in the og:image meta tag # check opengraph tags on the page and make a thumbnail if an image is available in the og:image meta tag
opengraph = opengraph_parse(form.link_url.data) opengraph = opengraph_parse(form.link_url.data)
if opengraph and opengraph.get('og:image', '') != '': if opengraph and (opengraph.get('og:image', '') != '' or opengraph.get('og:image:url', '') != ''):
filename = opengraph.get('og:image') filename = opengraph.get('og:image') or opengraph.get('og:image:url')
if '?' in filename:
filename = filename.split('?')[0]
unused, file_extension = os.path.splitext(filename) unused, file_extension = os.path.splitext(filename)
if file_extension.lower() in allowed_extensions: if file_extension.lower() in allowed_extensions:
file = url_to_thumbnail_file(filename) file = url_to_thumbnail_file(filename)

View file

@ -18,7 +18,7 @@ from sqlalchemy import select, desc, text
from sqlalchemy_searchable import search from sqlalchemy_searchable import search
from app.utils import render_template, get_setting, gibberish, request_etag_matches, return_304, blocked_domains, \ from app.utils import render_template, get_setting, gibberish, request_etag_matches, return_304, blocked_domains, \
ap_datetime, ip_address, retrieve_block_list, shorten_string, markdown_to_text, user_filters_home, \ ap_datetime, ip_address, retrieve_block_list, shorten_string, markdown_to_text, user_filters_home, \
joined_communities, moderating_communities joined_communities, moderating_communities, parse_page
from app.models import Community, CommunityMember, Post, Site, User, utcnow, Domain, Topic from app.models import Community, CommunityMember, Post, Site, User, utcnow, Domain, Topic
from PIL import Image from PIL import Image
import pytesseract import pytesseract
@ -249,6 +249,9 @@ def keyboard_shortcuts():
@bp.route('/test') @bp.route('/test')
def test(): def test():
x = parse_page('https://slate.com/technology/2024/02/quora-what-happened-ai-decline.html')
return str(x)
return current_app.config['SERVER_NAME'] return current_app.config['SERVER_NAME']
#ip = request.headers.get('X-Forwarded-For') or request.remote_addr #ip = request.headers.get('X-Forwarded-For') or request.remote_addr

View file

@ -622,3 +622,50 @@ def confidence(ups, downs) -> float:
return 0.0 return 0.0
else: else:
return _confidence(ups, downs) return _confidence(ups, downs)
# Open Graph properties that parse_page() searches for when the caller does not
# supply its own list. Not exhaustive - add further property names as needed.
KNOWN_OPENGRAPH_TAGS = [
    # basic page properties
    "og:site_name",
    "og:title",
    "og:locale",
    "og:type",
    "og:image",
    "og:url",
    # structured properties describing the image
    "og:image:url",
    "og:image:secure_url",
    "og:image:type",
    "og:image:width",
    "og:image:height",
    "og:image:alt",
]
def parse_page(page_url, tags_to_search = KNOWN_OPENGRAPH_TAGS, fallback_tags = None):
    '''
    Fetch a page and return a dictionary of the Open Graph meta tags found on it.

    page_url: address of the page to fetch (retrieved with get_request).
    tags_to_search: iterable of Open Graph property names to look for. Defaults
        to KNOWN_OPENGRAPH_TAGS; pass a shorter list to search for only the one
        or two tags you actually need.
    fallback_tags: optional dict mapping an Open Graph property name to the name
        of an ordinary HTML element whose text is used when the page has no
        usable meta tag for that property.

    Returns a dict of property name -> value containing only the tags that were
    found, or False if the response status code is anything other than 200.
    '''
    # read the html from the page
    response = get_request(page_url)
    # Compare by value: "is not 200" tests object identity, which only appears to
    # work because of CPython's small-integer cache (and is a SyntaxWarning on 3.8+).
    if response.status_code != 200:
        return False

    # set up beautiful soup
    soup = BeautifulSoup(response.content, 'html.parser')

    # loop through the requested opengraph tags, adding each one found to the result.
    found_tags = {}
    for og_tag in tags_to_search:
        meta_tag = soup.find("meta", property=og_tag)
        # A <meta property="..."> without a content attribute carries no value;
        # treat it as missing instead of raising KeyError on a malformed page.
        content = meta_tag.get("content") if meta_tag is not None else None
        if content is not None:
            found_tags[meta_tag["property"]] = content
        elif fallback_tags is not None and og_tag in fallback_tags:
            fallback_element = soup.find(fallback_tags[og_tag])
            # The fallback element may be absent as well; skip it rather than
            # failing with AttributeError on None.
            if fallback_element is not None:
                found_tags[og_tag] = fallback_element.text
    return found_tags

View file

@ -24,7 +24,6 @@ beautifulsoup4==4.12.2
flask-caching==2.0.2 flask-caching==2.0.2
Pillow Pillow
pillow-heif pillow-heif
opengraph-parse==0.0.6
feedgen==0.9.0 feedgen==0.9.0
celery==5.3.6 celery==5.3.6
redis==5.0.1 redis==5.0.1