Improve HTML parsing: make plain-text links clickable (with debug changes)

This commit is contained in:
rimu 2024-01-11 08:21:33 +13:00
parent 438ac72657
commit 4e1a5efac5

View file

@@ -9,7 +9,7 @@ import math
from urllib.parse import urlparse from urllib.parse import urlparse
from functools import wraps from functools import wraps
import flask import flask
from bs4 import BeautifulSoup from bs4 import BeautifulSoup, NavigableString
import requests import requests
import os import os
import imghdr import imghdr
@@ -159,16 +159,24 @@ def allowlist_html(html: str) -> str:
soup = BeautifulSoup(html, 'html.parser') soup = BeautifulSoup(html, 'html.parser')
# Find all plain text links, convert to <a> tags # Find all plain text links, convert to <a> tags
plain_text_links = soup.find_all(text=lambda text: re.search(r'https?://\S+', text)) re_url = re.compile(r'(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)')
for text_link in plain_text_links: for tag in soup.find_all(text=True):
# Create a new anchor tag tags = []
new_anchor = soup.new_tag('a', href=text_link) url = False
# Set the anchor's text to be the link itself for t in re_url.split(tag.string):
new_anchor.string = text_link if re_url.match(t):
# Replace the plain text link with the new anchor tag a = soup.new_tag("a", href=t)
text_link.replace_with(new_anchor) a.string = t
tags.append(a)
url = True
else:
tags.append(t)
if url:
for t in tags:
tag.insert_before(t)
tag.extract()
# Find all tags in the parsed HTML # Filter tags, leaving only safe ones
for tag in soup.find_all(): for tag in soup.find_all():
# If the tag is not in the allowed_tags list, remove it and its contents # If the tag is not in the allowed_tags list, remove it and its contents
if tag.name not in allowed_tags: if tag.name not in allowed_tags: