Improve HTML parsing: make plain-text links clickable (with debug changes)

This commit is contained in:
rimu 2024-01-11 08:21:33 +13:00
parent 438ac72657
commit 4e1a5efac5

View file

@@ -9,7 +9,7 @@ import math
from urllib.parse import urlparse from urllib.parse import urlparse
from functools import wraps from functools import wraps
import flask import flask
from bs4 import BeautifulSoup from bs4 import BeautifulSoup, NavigableString
import requests import requests
import os import os
import imghdr import imghdr
@@ -159,16 +159,24 @@ def allowlist_html(html: str) -> str:
soup = BeautifulSoup(html, 'html.parser') soup = BeautifulSoup(html, 'html.parser')
# Find all plain text links, convert to <a> tags # Find all plain text links, convert to <a> tags
plain_text_links = soup.find_all(text=lambda text: re.search(r'https?://\S+', text)) re_url = re.compile(r'(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)')
for text_link in plain_text_links: for tag in soup.find_all(text=True):
# Create a new anchor tag tags = []
new_anchor = soup.new_tag('a', href=text_link) url = False
# Set the anchor's text to be the link itself for t in re_url.split(tag.string):
new_anchor.string = text_link if re_url.match(t):
# Replace the plain text link with the new anchor tag a = soup.new_tag("a", href=t)
text_link.replace_with(new_anchor) a.string = t
tags.append(a)
url = True
else:
tags.append(t)
if url:
for t in tags:
tag.insert_before(t)
tag.extract()
# Find all tags in the parsed HTML # Filter tags, leaving only safe ones
for tag in soup.find_all(): for tag in soup.find_all():
# If the tag is not in the allowed_tags list, remove it and its contents # If the tag is not in the allowed_tags list, remove it and its contents
if tag.name not in allowed_tags: if tag.name not in allowed_tags: