add bypass paywall links for several sites #205

This commit is contained in:
rimu 2024-06-16 19:03:23 +08:00
parent 3f2690a42b
commit 811c9eb5f3
2 changed files with 7 additions and 4 deletions

View file

@ -16,7 +16,7 @@ from app.inoculation import inoculation
from app.post.forms import NewReplyForm, ReportPostForm, MeaCulpaForm
from app.community.forms import CreateLinkForm, CreateImageForm, CreateDiscussionForm, CreateVideoForm, CreatePollForm
from app.post.util import post_replies, get_comment_branch, post_reply_count, tags_to_string, url_has_paywall, \
generate_paywall_bypass_link
generate_paywall_bypass_link, body_has_no_paywall_link
from app.constants import SUBSCRIPTION_MEMBER, SUBSCRIPTION_OWNER, SUBSCRIPTION_MODERATOR, POST_TYPE_LINK, \
POST_TYPE_IMAGE, \
POST_TYPE_ARTICLE, POST_TYPE_VIDEO, NOTIF_REPLY, NOTIF_POST, POST_TYPE_POLL
@ -302,7 +302,7 @@ def show_post(post_id: int):
# Bypass paywalls link
bypass_paywall_link = None
if post.type == POST_TYPE_LINK and 'https://archive.' not in post.body_html and url_has_paywall(post.url):
if post.type == POST_TYPE_LINK and body_has_no_paywall_link(post.body_html) and url_has_paywall(post.url):
bypass_paywall_link = generate_paywall_bypass_link(post.url)
response = render_template('post/post.html', title=post.title, post=post, is_moderator=is_moderator, community=post.community,

View file

@ -81,9 +81,13 @@ def tags_to_string(post: Post) -> str:
return ', '.join([tag.name for tag in post.tags])
def body_has_no_paywall_link(body) -> bool:
    """Return True when *body* contains no paywall-bypass link.

    The check is a plain substring search for the two known bypass
    service prefixes, 'https://archive.' and 'https://12ft.io'.

    :param body: text to inspect; may be None or empty.
    :return: True if neither prefix occurs in body (or body is empty/None),
        otherwise False.
    """
    if not body:
        # Nothing to search, so no bypass link can be present. This also
        # avoids the TypeError that `'...' not in None` would raise.
        return True
    return 'https://archive.' not in body and 'https://12ft.io' not in body
def url_has_paywall(url) -> bool:
paywalled_sites = ['washingtonpost.com', 'wapo.st', 'nytimes.com', 'wsj.com', 'economist.com', 'ft.com', 'telegraph.co.uk',
'bild.de', 'theatlantic.com', 'lemonde.fr']
'bild.de', 'theatlantic.com', 'lemonde.fr', 'nzherald.co.nz']
if url:
try:
parsed_url = urlparse(url.replace('www.', ''))
@ -96,5 +100,4 @@ def url_has_paywall(url) -> bool:
def generate_paywall_bypass_link(url) -> str:
    """Build an archive.ph link for *url*.

    :param url: URL of the page to link to, as a string.
    :return: 'https://archive.ph/' followed by the unmodified url.
    """
    # The URL is appended as-is, protocol included; nothing is stripped.
    return 'https://archive.ph/' + url