mirror of
https://codeberg.org/rimu/pyfedi
synced 2025-01-23 11:26:56 -08:00
set user-agent while parsing og:image meta tag
This commit is contained in:
parent
cb2762e15e
commit
d69dac7631
4 changed files with 56 additions and 7 deletions
|
@ -15,10 +15,9 @@ from app.constants import POST_TYPE_ARTICLE, POST_TYPE_LINK, POST_TYPE_IMAGE
|
||||||
from app.models import Community, File, BannedInstances, PostReply, PostVote, Post, utcnow, CommunityMember, Site, \
|
from app.models import Community, File, BannedInstances, PostReply, PostVote, Post, utcnow, CommunityMember, Site, \
|
||||||
Instance, Notification, User
|
Instance, Notification, User
|
||||||
from app.utils import get_request, gibberish, markdown_to_html, domain_from_url, allowlist_html, \
|
from app.utils import get_request, gibberish, markdown_to_html, domain_from_url, allowlist_html, \
|
||||||
html_to_markdown, is_image_url, ensure_directory_exists, inbox_domain, post_ranking, shorten_string
|
html_to_markdown, is_image_url, ensure_directory_exists, inbox_domain, post_ranking, shorten_string, parse_page
|
||||||
from sqlalchemy import desc, text
|
from sqlalchemy import desc, text
|
||||||
import os
|
import os
|
||||||
from opengraph_parse import parse_page
|
|
||||||
|
|
||||||
|
|
||||||
allowed_extensions = ['.gif', '.jpg', '.jpeg', '.png', '.webp', '.heic']
|
allowed_extensions = ['.gif', '.jpg', '.jpeg', '.png', '.webp', '.heic']
|
||||||
|
@ -130,7 +129,6 @@ def actor_to_community(actor) -> Community:
|
||||||
return community
|
return community
|
||||||
|
|
||||||
|
|
||||||
@cache.memoize(timeout=50)
|
|
||||||
def opengraph_parse(url):
|
def opengraph_parse(url):
|
||||||
if '?' in url:
|
if '?' in url:
|
||||||
url = url.split('?')
|
url = url.split('?')
|
||||||
|
@ -199,8 +197,10 @@ def save_post(form, post: Post):
|
||||||
else:
|
else:
|
||||||
# check opengraph tags on the page and make a thumbnail if an image is available in the og:image meta tag
|
# check opengraph tags on the page and make a thumbnail if an image is available in the og:image meta tag
|
||||||
opengraph = opengraph_parse(form.link_url.data)
|
opengraph = opengraph_parse(form.link_url.data)
|
||||||
if opengraph and opengraph.get('og:image', '') != '':
|
if opengraph and (opengraph.get('og:image', '') != '' or opengraph.get('og:image:url', '') != ''):
|
||||||
filename = opengraph.get('og:image')
|
filename = opengraph.get('og:image') or opengraph.get('og:image:url')
|
||||||
|
if '?' in filename:
|
||||||
|
filename = filename.split('?')[0]
|
||||||
unused, file_extension = os.path.splitext(filename)
|
unused, file_extension = os.path.splitext(filename)
|
||||||
if file_extension.lower() in allowed_extensions:
|
if file_extension.lower() in allowed_extensions:
|
||||||
file = url_to_thumbnail_file(filename)
|
file = url_to_thumbnail_file(filename)
|
||||||
|
|
|
@ -18,7 +18,7 @@ from sqlalchemy import select, desc, text
|
||||||
from sqlalchemy_searchable import search
|
from sqlalchemy_searchable import search
|
||||||
from app.utils import render_template, get_setting, gibberish, request_etag_matches, return_304, blocked_domains, \
|
from app.utils import render_template, get_setting, gibberish, request_etag_matches, return_304, blocked_domains, \
|
||||||
ap_datetime, ip_address, retrieve_block_list, shorten_string, markdown_to_text, user_filters_home, \
|
ap_datetime, ip_address, retrieve_block_list, shorten_string, markdown_to_text, user_filters_home, \
|
||||||
joined_communities, moderating_communities
|
joined_communities, moderating_communities, parse_page
|
||||||
from app.models import Community, CommunityMember, Post, Site, User, utcnow, Domain, Topic
|
from app.models import Community, CommunityMember, Post, Site, User, utcnow, Domain, Topic
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
import pytesseract
|
import pytesseract
|
||||||
|
@ -249,6 +249,9 @@ def keyboard_shortcuts():
|
||||||
@bp.route('/test')
|
@bp.route('/test')
|
||||||
def test():
|
def test():
|
||||||
|
|
||||||
|
x = parse_page('https://slate.com/technology/2024/02/quora-what-happened-ai-decline.html')
|
||||||
|
return str(x)
|
||||||
|
|
||||||
return current_app.config['SERVER_NAME']
|
return current_app.config['SERVER_NAME']
|
||||||
|
|
||||||
#ip = request.headers.get('X-Forwarded-For') or request.remote_addr
|
#ip = request.headers.get('X-Forwarded-For') or request.remote_addr
|
||||||
|
|
47
app/utils.py
47
app/utils.py
|
@ -622,3 +622,50 @@ def confidence(ups, downs) -> float:
|
||||||
return 0.0
|
return 0.0
|
||||||
else:
|
else:
|
||||||
return _confidence(ups, downs)
|
return _confidence(ups, downs)
|
||||||
|
|
||||||
|
|
||||||
|
# Open Graph properties that parse_page() looks for by default.
# This is deliberately not an exhaustive list; extend it when another
# property is needed.
KNOWN_OPENGRAPH_TAGS = [
    # general page metadata
    "og:site_name", "og:title", "og:locale", "og:type", "og:image", "og:url",
    # structured image properties
    "og:image:url", "og:image:secure_url", "og:image:type",
    "og:image:width", "og:image:height", "og:image:alt",
]
|
||||||
|
|
||||||
|
|
||||||
|
def parse_page(page_url, tags_to_search=KNOWN_OPENGRAPH_TAGS, fallback_tags=None):
    """
    Fetch a page and return a dictionary of the Open Graph tags found on it.

    :param page_url: URL of the page; it is fetched with get_request().
    :param tags_to_search: iterable of <meta> "property" values to look for.
        Defaults to KNOWN_OPENGRAPH_TAGS; pass a shorter list to search for
        only the one or two tags you actually need.
    :param fallback_tags: optional mapping of og tag -> HTML element name.
        When a tag from tags_to_search has no usable <meta> element, the text
        of the first element with that name is used instead.
    :return: dict mapping each found tag to its value, or False when the
        response status code is not 200.
    """
    # read the html from the page
    response = get_request(page_url)

    # Compare by value. `is not 200` tests object identity, which only works
    # by accident of CPython's small-int cache and emits a SyntaxWarning on
    # Python 3.8+.
    if response.status_code != 200:
        return False

    # set up beautiful soup
    soup = BeautifulSoup(response.content, 'html.parser')

    # loop through the requested opengraph tags, collecting the ones present
    found_tags = {}
    for og_tag in tags_to_search:
        meta_tag = soup.find("meta", property=og_tag)
        # A <meta property=...> without a content attribute carries no value;
        # treat it as missing instead of raising KeyError.
        if meta_tag is not None and meta_tag.get("content") is not None:
            found_tags[meta_tag["property"]] = meta_tag["content"]
        elif fallback_tags is not None and og_tag in fallback_tags:
            fallback_element = soup.find(fallback_tags[og_tag])
            # The fallback element may be absent from the page as well; skip
            # it rather than failing on None.text.
            if fallback_element is not None:
                found_tags[og_tag] = fallback_element.text

    return found_tags
|
||||||
|
|
|
@ -24,7 +24,6 @@ beautifulsoup4==4.12.2
|
||||||
flask-caching==2.0.2
|
flask-caching==2.0.2
|
||||||
Pillow
|
Pillow
|
||||||
pillow-heif
|
pillow-heif
|
||||||
opengraph-parse==0.0.6
|
|
||||||
feedgen==0.9.0
|
feedgen==0.9.0
|
||||||
celery==5.3.6
|
celery==5.3.6
|
||||||
redis==5.0.1
|
redis==5.0.1
|
||||||
|
|
Loading…
Reference in a new issue