From 96e355bb722b0067633aad62a6851782f9c763e3 Mon Sep 17 00:00:00 2001 From: Helge Jung <hej@c3pb.de> Date: Sat, 21 Dec 2024 02:40:25 +0100 Subject: [PATCH] core: extract markdown.MyHtmlRenderer.handle_link() into utils.resolve_link() --- src/core/markdown.py | 52 ++++---------------------------------- src/core/tests/markdown.py | 1 + src/core/utils.py | 45 ++++++++++++++++++++++++++++++++- 3 files changed, 50 insertions(+), 48 deletions(-) diff --git a/src/core/markdown.py b/src/core/markdown.py index b6011d3c2..ad3ca6471 100644 --- a/src/core/markdown.py +++ b/src/core/markdown.py @@ -1,6 +1,5 @@ import html import re -from urllib.parse import quote, urlparse import bleach import mistletoe @@ -8,7 +7,6 @@ from mistletoe.block_token import BlockToken from mistletoe.html_renderer import HTMLRenderer from mistletoe.span_token import AutoLink, Link, SpanToken, tokenize_inner -from django.conf import settings from django.db.models import Model from django.urls import NoReverseMatch, reverse from django.utils.safestring import mark_safe @@ -17,24 +15,13 @@ from modeltranslation.fields import build_localized_fieldname from modeltranslation.settings import AVAILABLE_LANGUAGES from .models import conference -from .utils import scheme_and_netloc_from_url, url_in_allowlist +from .utils import resolve_link def markdown_header_slugify(value: str, separator: str) -> str: return 'md-' + slugify(value) -def is_trusted_dst(url: str): - url = urlparse(url) - is_local_domain = url.netloc in settings.ALLOWED_HOSTS - is_external = (url.scheme or url.netloc) and not is_local_domain - return not is_external or url.scheme not in {'http', 'https', 'ftp', 'ftps'} - - -def redirect_via_dereferer(url: str): - return settings.PLAINUI_DEREFERER_URL.format(quoted_target=quote(url)) - - class PageLink(SpanToken): pattern = re.compile(r'\[\[ *([^|\]]+?) *(?:\| *(.*))? *\]\]') parse_group = 2 @@ -109,46 +96,17 @@ class MyHtmlRenderer(HTMLRenderer): result += '</div>\n' return result - def __init__(self, conf: 'conference.Conference', result: 'RenderResult', *extras, derefer_allowlist: bool = True, **kwargs): + def __init__(self, conf: 'conference.Conference', result: 'RenderResult', *extras, use_derefer_allowlist: bool = True, **kwargs): self.conf = conf self.result = result - self.derefer_allowlist = derefer_allowlist + self.use_derefer_allowlist = use_derefer_allowlist super().__init__(PageLink, ProfileLink, Tag, AlertBlock, *extras, **kwargs) - def derive_link_target(self, url): - """rewrite given URL unless it is trusted or in dereferrer-allowlist while those shall not be dereferred""" - - do_derefer = True - if not self.derefer_allowlist: - try: - scheme_and_netloc = scheme_and_netloc_from_url(url) - if url_in_allowlist(scheme_and_netloc, settings.DEREFERRER_GLOBAL_ALLOWLIST): - do_derefer = False - except ValueError: - # ignore URL parsing error - pass - - return redirect_via_dereferer(url) if do_derefer else url - - def handle_link(self, url: str) -> tuple[str, str]: - from .utils import resolve_internal_url - - # attempt resolving an internal URL - if resolved_internal_url := resolve_internal_url(url, accept_http_https=False, fallback_as_is=False): - url = resolved_internal_url - - # derive external link (i.e. apply dereferer), if its not an internal or trusted location - if resolved_internal_url is None and not is_trusted_dst(url): - return 'external', self.derive_link_target(url) - - # otherwise, it's an internal link - return 'internal', url - def render_link(self, token: Link) -> str: if token.target.startswith(('javascript:', 'data:')): token.target = '' - link_type, url = self.handle_link(token.target) + link_type, url = resolve_link(token.target, self.use_derefer_allowlist) self.result.linked_urls.add(url) template = '<a href="{target}"{title} class="{link_type}">{inner}</a>' @@ -269,7 +227,7 @@ def render_markdown_ex( result = RenderResult() with renderer(conf, result) as renderer: - renderer.derefer_allowlist = not dont_derefer_allowlist + renderer.use_derefer_allowlist = dont_derefer_allowlist rendered_markup = renderer.render(mistletoe.Document(markup)) if sanitize_html: diff --git a/src/core/tests/markdown.py b/src/core/tests/markdown.py index d30892ab8..0ab5b990e 100644 --- a/src/core/tests/markdown.py +++ b/src/core/tests/markdown.py @@ -19,6 +19,7 @@ class MarkdownTest(TestCase): conf = Conference(name='foo', id=TEST_CONF_ID) conf.save() + # TODO: consider moving this test into tests/utils.py as this is basically testing resolve_link() only (except the footnote/anchored links part) tests = [ ('https://localhost/', False), ('https://localhost/foo', False), diff --git a/src/core/utils.py b/src/core/utils.py index cd3c151c2..a96d9e4bd 100644 --- a/src/core/utils.py +++ b/src/core/utils.py @@ -8,7 +8,7 @@ import uuid from datetime import UTC, datetime, timedelta from pathlib import Path from string import ascii_letters, digits -from urllib.parse import parse_qs, urlparse, urlunparse +from urllib.parse import parse_qs, quote, urlparse, urlunparse import requests @@ -213,6 +213,49 @@ def resolve_internal_url(url: str, accept_http_https: bool = True, fallback_as_i return url if fallback_as_is else None +def get_dereferred_url(url: str, use_derefer_allowlist: bool = True): + """rewrite given URL unless it is trusted or in dereferrer-allowlist while those shall not be dereferred""" + + do_derefer = True + if use_derefer_allowlist: + try: + scheme_and_netloc = scheme_and_netloc_from_url(url) + if url_in_allowlist(scheme_and_netloc, settings.DEREFERRER_GLOBAL_ALLOWLIST): + do_derefer = False + except ValueError: + # ignore URL parsing error + pass + + return settings.PLAINUI_DEREFERER_URL.format(quoted_target=quote(url)) if do_derefer else url + + +def is_trusted_link_destination(url: str): + url = urlparse(url) + is_local_domain = url.netloc in settings.ALLOWED_HOSTS + is_external = (url.scheme or url.netloc) and not is_local_domain + return not is_external or url.scheme not in {'http', 'https', 'ftp', 'ftps'} + + +def resolve_link(url: str, use_derefer_allowlist: bool = True) -> tuple[str, str]: + """ + Resolves a given URL, classifies it as internal or external and optionally rewrites it to use the dereferrer. + :param url: the original URL to resolve + :param use_derefer_allowlist: controls if the global allowlist (see settings.DEREFERRER_GLOBAL_ALLOWLIST) shall be used + :return: tuple with two values, the first being either 'internal' or 'external', the second being the resolved link (might be to the dereferrer) + """ + + # attempt resolving an internal URL + if resolved_internal_url := resolve_internal_url(url, accept_http_https=False, fallback_as_is=False): + url = resolved_internal_url + + # derive external link (i.e. apply dereferer), if its not an internal or trusted location + if resolved_internal_url is None and not is_trusted_link_destination(url): + return 'external', get_dereferred_url(url, use_derefer_allowlist=use_derefer_allowlist) + + # otherwise, it's an internal link + return 'internal', url + + def download_from_url(url: str) -> tuple[str, bytes]: # let requests library fetch the URL r = requests.get(url, timeout=30) -- GitLab