format_links: refactor; support multiple punctuation

This commit is contained in:
Adeodato Simó 2023-10-09 21:41:22 -03:00
parent 17d741039c
commit 294788aa1a
No known key found for this signature in database
GPG key ID: CDF447845F1A986F
2 changed files with 28 additions and 47 deletions

View file

@ -427,6 +427,10 @@ http://www.fish.com/"""
views.status.format_links(f"{url}."), views.status.format_links(f"{url}."),
f'<a href="{url}">www.fish.com/</a>.', f'<a href="{url}">www.fish.com/</a>.',
) )
self.assertEqual(
views.status.format_links(f"{url}!?!"),
f'<a href="{url}">www.fish.com/</a>!?!',
)
def test_format_links_punctuation_parens(self, *_): def test_format_links_punctuation_parens(self, *_):
"""ignore trailing punctuation and brackets combined""" """ignore trailing punctuation and brackets combined"""

View file

@ -1,7 +1,6 @@
""" what are we here for if not for posting """ """ what are we here for if not for posting """
import re import re
import logging import logging
from urllib.parse import urlparse
from django.contrib.auth.decorators import login_required from django.contrib.auth.decorators import login_required
from django.core.validators import URLValidator from django.core.validators import URLValidator
@ -297,67 +296,45 @@ def find_or_create_hashtags(content):
def format_links(content): def format_links(content):
"""detect and format links""" """detect and format links"""
validator = URLValidator() validator = URLValidator(["http", "https"])
formatted_content = "" schema_re = re.compile(r"\bhttps?://")
split_content = re.split(r"(\s+)", content) split_content = re.split(r"(\s+)", content)
for potential_link in split_content: for i, potential_link in enumerate(split_content):
if not potential_link: if not schema_re.search(potential_link):
continue continue
# FIXME: allow for multiple punctuation characters, e.g. `...` and `!?`. # Strip surrounding brackets and trailing punctuation.
ends_with_punctuation = _ends_with_punctuation(potential_link) prefix, potential_link, suffix = _unwrap(potential_link)
if ends_with_punctuation:
punctuation_glyph = potential_link[-1]
potential_link = potential_link[0:-1]
wrapped = _wrapped(potential_link)
if wrapped:
wrapper_close = potential_link[-1]
formatted_content += potential_link[0]
potential_link = potential_link[1:-1]
try: try:
# raises an error on anything that's not a valid link # raises an error on anything that's not a valid link
validator(potential_link) validator(potential_link)
# use everything but the scheme in the presentation of the link # use everything but the scheme in the presentation of the link
url = urlparse(potential_link) link = schema_re.sub("", potential_link)
link = url.netloc + url.path + url.params split_content[i] = f'{prefix}<a href="{potential_link}">{link}</a>{suffix}'
if url.query != "":
link += "?" + url.query
if url.fragment != "":
link += "#" + url.fragment
formatted_content += f'<a href="{potential_link}">{link}</a>'
except (ValidationError, UnicodeError): except (ValidationError, UnicodeError):
formatted_content += potential_link pass
if wrapped: return "".join(split_content)
formatted_content += wrapper_close
if ends_with_punctuation:
formatted_content += punctuation_glyph
return formatted_content
def _wrapped(text): def _unwrap(text):
"""check if a line of text is wrapped""" """split surrounding brackets and trailing punctuation from a string of text"""
wrappers = ["()", "[]", "{}"] punct = re.compile(r'([.,;:!?"’”»]+)\Z')
for wrapper in wrappers: prefix = suffix = ""
if punct.search(text):
# Move punctuation to suffix segment.
text, suffix, _ = punct.split(text)
for wrapper in ("()", "[]", "{}"):
if text[0] == wrapper[0] and text[-1] == wrapper[-1]: if text[0] == wrapper[0] and text[-1] == wrapper[-1]:
return True # Split out wrapping chars.
return False suffix = text[-1] + suffix
prefix, text = text[:1], text[1:-1]
return prefix, text, suffix
def _ends_with_punctuation(text):
"""check if a line of text ends with a punctuation glyph"""
glyphs = [".", ",", ";", ":", "!", "?", "”", "’", '"', "»"]
for glyph in glyphs:
if text[-1] == glyph:
return True
return False
def to_markdown(content): def to_markdown(content):