mirror of
https://codeberg.org/rimu/pyfedi
synced 2025-01-23 19:36:56 -08:00
improve html parsing - make plain links clickable - debug
This commit is contained in:
parent
438ac72657
commit
4e1a5efac5
1 changed file with 18 additions and 10 deletions
28
app/utils.py
28
app/utils.py
|
@ -9,7 +9,7 @@ import math
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
from functools import wraps
|
from functools import wraps
|
||||||
import flask
|
import flask
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup, NavigableString
|
||||||
import requests
|
import requests
|
||||||
import os
|
import os
|
||||||
import imghdr
|
import imghdr
|
||||||
|
@ -159,16 +159,24 @@ def allowlist_html(html: str) -> str:
|
||||||
soup = BeautifulSoup(html, 'html.parser')
|
soup = BeautifulSoup(html, 'html.parser')
|
||||||
|
|
||||||
# Find all plain text links, convert to <a> tags
|
# Find all plain text links, convert to <a> tags
|
||||||
plain_text_links = soup.find_all(text=lambda text: re.search(r'https?://\S+', text))
|
re_url = re.compile(r'(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)')
|
||||||
for text_link in plain_text_links:
|
for tag in soup.find_all(text=True):
|
||||||
# Create a new anchor tag
|
tags = []
|
||||||
new_anchor = soup.new_tag('a', href=text_link)
|
url = False
|
||||||
# Set the anchor's text to be the link itself
|
for t in re_url.split(tag.string):
|
||||||
new_anchor.string = text_link
|
if re_url.match(t):
|
||||||
# Replace the plain text link with the new anchor tag
|
a = soup.new_tag("a", href=t)
|
||||||
text_link.replace_with(new_anchor)
|
a.string = t
|
||||||
|
tags.append(a)
|
||||||
|
url = True
|
||||||
|
else:
|
||||||
|
tags.append(t)
|
||||||
|
if url:
|
||||||
|
for t in tags:
|
||||||
|
tag.insert_before(t)
|
||||||
|
tag.extract()
|
||||||
|
|
||||||
# Find all tags in the parsed HTML
|
# Filter tags, leaving only safe ones
|
||||||
for tag in soup.find_all():
|
for tag in soup.find_all():
|
||||||
# If the tag is not in the allowed_tags list, remove it and its contents
|
# If the tag is not in the allowed_tags list, remove it and its contents
|
||||||
if tag.name not in allowed_tags:
|
if tag.name not in allowed_tags:
|
||||||
|
|
Loading…
Reference in a new issue