mirror of
https://codeberg.org/rimu/pyfedi
synced 2025-02-02 16:21:32 -08:00
cope with titles with no punctuation #185
This commit is contained in:
parent
2e0d27c4f0
commit
df6edf040b
2 changed files with 19 additions and 6 deletions
|
@ -27,7 +27,7 @@ from app.utils import render_template, get_setting, gibberish, request_etag_matc
|
|||
ap_datetime, ip_address, retrieve_block_list, shorten_string, markdown_to_text, user_filters_home, \
|
||||
joined_communities, moderating_communities, parse_page, theme_list, get_request, markdown_to_html, allowlist_html, \
|
||||
blocked_instances, communities_banned_from, topic_tree, recently_upvoted_posts, recently_downvoted_posts, \
|
||||
generate_image_from_video_url, blocked_users
|
||||
generate_image_from_video_url, blocked_users, microblog_content_to_title
|
||||
from app.models import Community, CommunityMember, Post, Site, User, utcnow, Domain, Topic, File, Instance, \
|
||||
InstanceRole, Notification, Language, community_language
|
||||
from PIL import Image
|
||||
|
@ -321,6 +321,14 @@ def list_files(directory):
|
|||
@bp.route('/test')
|
||||
def test():
|
||||
|
||||
#test_html = '<p>I'm slowly realising that I probably have some mild <a href=\"https://hachyderm.io/tags/LongCovid\" class=\"mention hashtag\" rel=\"tag\">#<span>LongCovid</span></a> </p><p>Since having covid (now had it twice since 2022): iron deficiencies, breathing problems, constant asthma, and now a sudden allergy to some foods apparently.</p><p>My partner and I have been careful throughout the pandemic but clearly not careful enough at times (twice each) since "opening up".</p><p>And though it could be far far worse, I feel pretty violated right now TBH.</p><p><span class=\"h-card\" translate=\"no\"><a href=\"https://lemmy.ml/c/coronavirus\" class=\"u-url mention\">@<span>coronavirus</span></a></span><br /><span class=\"h-card\" translate=\"no\"><a href=\"https://a.gup.pe/u/longcovid\" class=\"u-url mention\">@<span>longcovid</span></a></span></p>'
|
||||
#test_html = '<span class=\"h-card\"><a class=\"u-url mention\" data-user=\"AgYVuUCbKlLeZPIhc0\" href=\"https://lemmy.ml/c/aww\" rel=\"ugc\">@<span>aww</span></a></span> I can't contain my excitement'
|
||||
#test_html = '<p><a href=\"https://troet.cafe/tags/Garten\" class=\"mention hashtag\" rel=\"tag\">#<span>Garten</span></a> <a href=\"https://troet.cafe/tags/gardening\" class=\"mention hashtag\" rel=\"tag\">#<span>gardening</span></a> <br /><span class=\"h-card\" translate=\"no\"><a href=\"https://lemmy.world/c/gardening\" class=\"u-url mention\">@<span>gardening</span></a></span> <br /><span class=\"h-card\" translate=\"no\"><a href=\"https://woem.men/@garden\" class=\"u-url mention\">@<span>garden</span></a></span> </p><p>So - unser "Tulpenurlaub" ist vorbei, wir sind mit unserem 26PS E-Auto gut überall hin und wieder nach Hause gekommen.</p><p>Unsere 1. Station war der "Hortus Bulborum", Anlage einer gemeinnützigen Organisation, die sich der Erhaltung historischer Tulpenzwiebeln widmet. Um sie zu erhalten, müssen die Zwiebeln natürlich jedes Jahr wachsen und blühen . . .</p>'
|
||||
# test_html = '<p>Seems like <span class=\"h-card\" translate=\"no\"><a href=\"https://lemmy.ml/c/firefox\" class=\"u-url mention\">@<span>firefox</span></a></span> <span class=\"h-card\" translate=\"no\"><a href=\"https://mozilla.social/@mozilla\" class=\"u-url mention\">@<span>mozilla</span></a></span> is doing something right</p>'
|
||||
test_html = '<p><span>John Helmer: \"If Israel escalates by attacking Iran and striking at the country’s infrastructure, then Iran’s counter will be [...] Electric War.\" <br><br>Naked Capitalism commentary:<br></span><a href=\"https://www.nakedcapitalism.com/2024/04/middle-east-escalation-financial-times-revealing-account-of-israel-risk-of-ukraine-style-air-defense-attrition-helmer-on-possible-electrical-grid-campaign.html\">https://www.nakedcapitalism.com/2024/04/middle-east-escalation-financial-times-revealing-account-of-israel-risk-of-ukraine-style-air-defense-attrition-helmer-on-possible-electrical-grid-campaign.html</a><span><br><br>full article source:<br></span><a href=\"https://johnhelmer.net/loose-lips-dont-sink-ships-or-israel/\">https://johnhelmer.net/loose-lips-dont-sink-ships-or-israel/</a><span><br></span><b><span>LOOSE LIPS DON’T SINK SHIPS, OR ISRAEL</span></b><blockquote><span>[...]<br>If Israel escalates by attacking Iran and striking at the country’s infrastructure, then Iran’s counter will be to take a page out of Russia’s book and commence the one line of attack which Israel, the US and their allies cannot withstand any better than Ukraine – that’s </span><a href=\"https://web.archive.org/johnhelmer.net/?s=ELECTRIC+WAR\"><span>Electric War</span></a><span>.<br>For the seven months which have elapsed since Hamas began its operation against Israel on October 7, and Israel commenced its genocide against the Palestinians, there has been no targeting by Hamas, Hezbollah, the Houthis, or the Syrian and Iraqi groups of Israel’s highly vulnerable maritime gas platforms, gas pipelines, coal and oil-fired electricity generating plants, the coal and oil storages nearby, solar and wind power units, or the electricity grids keeping the country alight.<br>The Arab inhibitions and calculations are understandable. Iran’s will disappear if Israel triggers a new round of attacks.<br>[...]</span></blockquote><a href=\"https://lemmy.ml/c/worldnews\" class=\"u-url mention\">@worldnews@lemmy.ml</a><span> </span><a href=\"https://a.gup.pe/u/israel\" class=\"u-url mention\">@israel@a.gup.pe</a><span> </span><a href=\"https://a.gup.pe/u/iran\" class=\"u-url mention\">@iran@a.gup.pe</a><span> </span><a href=\"https://a.gup.pe/u/palestine\" class=\"u-url mention\">@palestine@a.gup.pe</a><span> </span><a href=\"https://a.gup.pe/u/imperialism\" class=\"u-url mention\">@imperialism@a.gup.pe</a><span><br></span><a href=\"https://procial.tchncs.de/tags/iran\" rel=\"tag\">#iran</a><span> </span><a href=\"https://procial.tchncs.de/tags/israel\" rel=\"tag\">#israel</a><span> </span><a href=\"https://procial.tchncs.de/tags/palestine\" rel=\"tag\">#palestine</a><span> </span><a href=\"https://procial.tchncs.de/tags/zionismisterrorism\" rel=\"tag\">#zionismisterrorism</a><span> </span><a href=\"https://procial.tchncs.de/tags/imperialism\" rel=\"tag\">#imperialism</a><span> </span><a href=\"https://procial.tchncs.de/tags/decolonization\" rel=\"tag\">#decolonization</a></p>'
|
||||
return microblog_content_to_title(test_html)
|
||||
|
||||
|
||||
md = "::: spoiler I'm all for ya having fun and your right to hurt yourself.\n\nI am a former racer, commuter, and professional Buyer for a chain of bike shops. I'm also disabled from the crash involving the 6th and 7th cars that have hit me in the last 170k+ miles of riding. I only barely survived what I simplify as a \"broken neck and back.\" Cars making U-turns are what will get you if you ride long enough, \n\nespecially commuting. It will look like just another person turning in front of you, you'll compensate like usual, and before your brain can even register what is really happening, what was your normal escape route will close and you're going to crash really hard. It is the only kind of crash that your intuition is useless against.\n:::"
|
||||
|
||||
return markdown_to_html(md)
|
||||
|
|
15
app/utils.py
15
app/utils.py
|
@ -265,10 +265,11 @@ def microblog_content_to_title(html: str) -> str:
|
|||
title = ''
|
||||
for tag in soup.find_all('p'):
|
||||
title = tag.get_text(separator=" ")
|
||||
break
|
||||
if title and title.strip() != '' and len(title.strip()) >= 5:
|
||||
break
|
||||
else:
|
||||
html = html.replace('<', '.', 1)
|
||||
title = shorten_string(html, 160)
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
title = soup.get_text()
|
||||
|
||||
period_index = title.find('.')
|
||||
question_index = title.find('?')
|
||||
|
@ -279,9 +280,13 @@ def microblog_content_to_title(html: str) -> str:
|
|||
question_index if question_index != -1 else float('inf'),
|
||||
exclamation_index if exclamation_index != -1 else float('inf'))
|
||||
|
||||
# give up if there's no recognised punctuation
|
||||
# there's no recognised punctuation
|
||||
if end_index == float('inf'):
|
||||
title = '(content in post body)'
|
||||
if len(title) >= 10:
|
||||
title = title.replace(' @ ', '').replace(' # ', '')
|
||||
title = shorten_string(title, 197)
|
||||
else:
|
||||
title = '(content in post body)'
|
||||
return title
|
||||
|
||||
if end_index != -1:
|
||||
|
|
Loading…
Add table
Reference in a new issue