Improve HTML parsing: make plain-text links clickable

2025-01-23 19:36:56 -08:00 · 2024-01-10 19:54:54 +13:00 · 2024-01-10 19:54:54 +13:00 · 438ac72657
commit 438ac72657
parent daf124ae49
3 changed files with 42 additions and 3 deletions
--- a/app/static/structure.css
+++ b/app/static/structure.css
@ -693,6 +693,19 @@ fieldset legend {
  background-color: #777;
  color: white;
  margin-bottom: 15px;
+  height: 30px;
+}
+.comment .show-more .fe-angles-down, .comment .show-more .fe-angles-up {
+  margin-top: 7px;
+  display: inline-block;
+}
+@media (min-width: 1280px) {
+  .comment .show-more {
+    height: 23px;
+  }
+  .comment .show-more .fe-angles-down, .comment .show-more .fe-angles-up {
+    display: inline;
+  }
 }
 .comment .comment_author img {
  width: 20px;
--- a/app/static/structure.scss
+++ b/app/static/structure.scss
@ -362,6 +362,19 @@ nav, etc which are used site-wide */
    background-color: $dark-grey;
    color: white;
    margin-bottom: 15px;
+    height: 30px;
+
+    .fe-angles-down, .fe-angles-up {
+      margin-top: 7px;
+      display: inline-block;
+    }
+
+    @include breakpoint(laptop) {
+      height: 23px;
+      .fe-angles-down, .fe-angles-up {
+        display: inline;
+      }
+    }
  }

  .comment_author {
--- a/app/utils.py
+++ b/app/utils.py
@ -7,7 +7,6 @@ from typing import List, Literal, Union
 import markdown2
 import math
 from urllib.parse import urlparse
-import requests
 from functools import wraps
 import flask
 from bs4 import BeautifulSoup
@ -20,6 +19,7 @@ from sqlalchemy import text
 from wtforms.fields  import SelectField, SelectMultipleField
 from wtforms.widgets import Select, html_params, ListWidget, CheckboxInput
 from app import db, cache
+import re

 from app.models import Settings, Domain, Instance, BannedInstances, User, Community, DomainBlock, ActivityPubLog, IpBan, \
    Site, Post, PostReply, utcnow
@ -158,6 +158,16 @@ def allowlist_html(html: str) -> str:
    # Parse the HTML using BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')

+    # Find every text node that contains an http(s) URL and wrap the whole node in an <a> tag
+    plain_text_links = soup.find_all(text=lambda text: re.search(r'https?://\S+', text))
+    for text_link in plain_text_links:
+        # Create a new anchor tag
+        new_anchor = soup.new_tag('a', href=text_link)
+        # Set the anchor's text to be the link itself
+        new_anchor.string = text_link
+        # Replace the plain text link with the new anchor tag
+        text_link.replace_with(new_anchor)
+
    # Find all tags in the parsed HTML
    for tag in soup.find_all():
        # If the tag is not in the allowed_tags list, remove it and its contents
@ -166,10 +176,13 @@ def allowlist_html(html: str) -> str:
        else:
            # Filter and sanitize attributes
            for attr in list(tag.attrs):
-                if attr not in ['href', 'src', 'alt']:  # Add allowed attributes here
+                if attr not in ['href', 'src', 'alt']:
                    del tag[attr]
+            # Add nofollow and target=_blank to anchors
+            if tag.name == 'a':
+                tag.attrs['rel'] = 'nofollow ugc'
+                tag.attrs['target'] = '_blank'

-    # Encode the HTML to prevent script execution
    return str(soup)