Handle extraneous HTML/text when generating titles for microblog content

This commit is contained in:
freamon 2024-05-10 15:46:16 +01:00
parent 843e9d060a
commit 424f8e004f

View file

@ -250,9 +250,10 @@ def microblog_content_to_title(html: str) -> str:
title = '' title = ''
for tag in soup.find_all('p'): for tag in soup.find_all('p'):
title = tag.get_text() title = tag.get_text(separator=" ")
break break
else: else:
html = html.replace('<', '.', 1)
title = shorten_string(html, 160) title = shorten_string(html, 160)
period_index = title.find('.') period_index = title.find('.')
@ -270,9 +271,9 @@ def microblog_content_to_title(html: str) -> str:
return title return title
if end_index != -1: if end_index != -1:
if question_index != -1: if question_index != -1 and question_index == end_index:
end_index += 1 # Add the ? back on end_index += 1 # Add the ? back on
if exclamation_index != -1: if exclamation_index != -1 and exclamation_index == end_index:
end_index += 1 # Add the ! back on end_index += 1 # Add the ! back on
title = title[:end_index] title = title[:end_index]