From e41d61cfaf4b11324df05c19bd475a52cdacabc7 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Thu, 7 May 2026 14:18:52 +0300 Subject: [PATCH 1/2] gh-149489: Fix ElementTree serialization to HTML * The content of comments, processing instructions and elements "xmp", "iframe", "noembed", "noframes", and "plaintext" is no longer escaped. * The "plaintext" element no longer have the closing tag. * Add support of empty attributes (with value None). --- Lib/test/test_xml_etree.py | 29 ++++++++++++++++++- Lib/xml/etree/ElementTree.py | 24 +++++++++------ ...-05-07-14-18-47.gh-issue-149489.bX9iHe.rst | 5 ++++ 3 files changed, 48 insertions(+), 10 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2026-05-07-14-18-47.gh-issue-149489.bX9iHe.rst diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py index 8f3efe9fc90794..b820845f3b63e2 100644 --- a/Lib/test/test_xml_etree.py +++ b/Lib/test/test_xml_etree.py @@ -1278,7 +1278,13 @@ def check(p, expected, namespaces=None): {'': 'http://www.w3.org/2001/XMLSchema', 'ns': 'http://www.w3.org/2001/XMLSchema'}) - def test_processinginstruction(self): + def test_comment_serialization(self): + comm = ET.Comment(' & ham') + self.assertEqual(ET.tostring(comm), b'') + self.assertEqual(ET.tostring(comm, method='html'), b'') + self.assertEqual(ET.tostring(comm, method='text'), b' & ham') + + def test_processinginstruction_serialization(self): # Test ProcessingInstruction directly self.assertEqual(ET.tostring(ET.ProcessingInstruction('test', 'instruction')), @@ -1293,6 +1299,21 @@ def test_processinginstruction(self): self.assertEqual(ET.tostring(ET.PI('test', '\xe3'), 'latin-1'), b"\n" b"\xe3?>") + self.assertEqual(ET.tostring(ET.PI('test', 'ham & eggs < spam'), method='html'), + b'') + + def test_empty_attribute_serialization(self): + elem = ET.Element('tag', attrib={'attr': None}) + self.assertRaises(TypeError, ET.tostring, elem) + self.assertEqual(ET.tostring(elem, method='html'), b'') + + 
@support.subTests('tag', ("script", "style", "xmp", "iframe", "noembed", "noframes")) + def test_html_cdata_elems_serialization(self, tag): + tag = tag.title() + elem = ET.Element(tag) + elem.text = '&ham' + self.assertEqual(ET.tostring(elem, method='html'), + ('<%s>&ham' % (tag, tag)).encode()) def test_html_empty_elems_serialization(self): # issue 15970 @@ -1308,6 +1329,12 @@ def test_html_empty_elems_serialization(self): method='html') self.assertEqual(serialized, expected) + def test_html_plaintext_serialization(self): + elem = ET.Element('PlainText') + elem.text = '&ham' + self.assertEqual(ET.tostring(elem, method='html'), + b'<spam>&ham') + def test_dump_attribute_order(self): # See BPO 34160 e = ET.Element('cirriculum', status='public', company='example') diff --git a/Lib/xml/etree/ElementTree.py b/Lib/xml/etree/ElementTree.py index 85766e02b531ce..7b14ec360d7cf7 100644 --- a/Lib/xml/etree/ElementTree.py +++ b/Lib/xml/etree/ElementTree.py @@ -907,17 +907,20 @@ def _serialize_xml(write, elem, qnames, namespaces, if elem.tail: write(_escape_cdata(elem.tail)) +_CDATA_CONTENT_ELEMENTS = {"script", "style", "xmp", "iframe", "noembed", + "noframes", "plaintext"} + HTML_EMPTY = {"area", "base", "basefont", "br", "col", "embed", "frame", "hr", "img", "input", "isindex", "link", "meta", "param", "source", - "track", "wbr"} + "track", "wbr", "plaintext"} def _serialize_html(write, elem, qnames, namespaces, **kwargs): tag = elem.tag text = elem.text if tag is Comment: - write("<!--%s-->" % _escape_cdata(text)) + write("<!--%s-->" % text) elif tag is ProcessingInstruction: - write("<?%s?>" % _escape_cdata(text)) + write("<?%s?>" % text) else: tag = qnames[tag] if tag is None: @@ -941,16 +944,19 @@ def _serialize_html(write, elem, qnames, namespaces, **kwargs): for k, v in items: if isinstance(k, QName): k = k.text - if isinstance(v, QName): - v = qnames[v.text] + k = qnames[k] + if v is None: + write(" %s" % k) else: - v = _escape_attrib_html(v) - # FIXME: handle 
boolean attributes - write(" %s=\"%s\"" % (qnames[k], v)) + if isinstance(v, QName): + v = qnames[v.text] + else: + v = _escape_attrib_html(v) + write(" %s=\"%s\"" % (k, v)) write(">") ltag = tag.lower() if text: - if ltag == "script" or ltag == "style": + if ltag in _CDATA_CONTENT_ELEMENTS: write(text) else: write(_escape_cdata(text)) diff --git a/Misc/NEWS.d/next/Library/2026-05-07-14-18-47.gh-issue-149489.bX9iHe.rst b/Misc/NEWS.d/next/Library/2026-05-07-14-18-47.gh-issue-149489.bX9iHe.rst new file mode 100644 index 00000000000000..1550c893fd7c45 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-05-07-14-18-47.gh-issue-149489.bX9iHe.rst @@ -0,0 +1,5 @@ +Fix :mod:`~xml.etree.ElementTree` serialization to HTML. The content of +comments, processing instructions and elements "xmp", "iframe", "noembed", +"noframes", and "plaintext" is no longer escaped. The "plaintext" element no +longer has a closing tag. Add support for empty attributes (with value +``None``). From a134c0b83ab6a612d44f7875efda7bb9f4625547 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka <storchaka@gmail.com> Date: Wed, 6 May 2026 22:23:29 +0300 Subject: [PATCH 2/2] gh-149468: Add option to validate ElementTree during serialization --- Doc/library/xml.etree.elementtree.rst | 30 ++- Doc/whatsnew/3.15.rst | 11 ++ Lib/test/test_xml_etree.py | 186 ++++++++++++++++++ Lib/xml/etree/ElementTree.py | 111 +++++++++-- ...-05-06-22-22-05.gh-issue-149468.IUSCzU.rst | 3 + 5 files changed, 321 insertions(+), 20 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2026-05-06-22-22-05.gh-issue-149468.IUSCzU.rst diff --git a/Doc/library/xml.etree.elementtree.rst b/Doc/library/xml.etree.elementtree.rst index 310ccd651e18c7..b8c8b8f3c009ec 100644 --- a/Doc/library/xml.etree.elementtree.rst +++ b/Doc/library/xml.etree.elementtree.rst @@ -711,14 +711,14 @@ Functions .. 
function:: tostring(element, encoding="us-ascii", method="xml", *, \ xml_declaration=None, default_namespace=None, \ - short_empty_elements=True) + validate=False, short_empty_elements=True) Generates a string representation of an XML element, including all subelements. *element* is an :class:`Element` instance. *encoding* [1]_ is the output encoding (default is US-ASCII). Use ``encoding="unicode"`` to generate a Unicode string (otherwise, a bytestring is generated). *method* is either ``"xml"``, ``"html"`` or ``"text"`` (default is ``"xml"``). - *xml_declaration*, *default_namespace* and *short_empty_elements* has the same + *xml_declaration*, *default_namespace*, *validate* and *short_empty_elements* have the same meaning as in :meth:`ElementTree.write`. Returns an (optionally) encoded string containing the XML data. @@ -732,17 +732,20 @@ Functions The :func:`tostring` function now preserves the attribute order specified by the user. + .. versionchanged:: next + Added the *validate* parameter. + .. function:: tostringlist(element, encoding="us-ascii", method="xml", *, \ xml_declaration=None, default_namespace=None, \ - short_empty_elements=True) + validate=False, short_empty_elements=True) Generates a string representation of an XML element, including all subelements. *element* is an :class:`Element` instance. *encoding* [1]_ is the output encoding (default is US-ASCII). Use ``encoding="unicode"`` to generate a Unicode string (otherwise, a bytestring is generated). *method* is either ``"xml"``, ``"html"`` or ``"text"`` (default is ``"xml"``). - *xml_declaration*, *default_namespace* and *short_empty_elements* has the same + *xml_declaration*, *default_namespace*, *validate* and *short_empty_elements* have the same meaning as in :meth:`ElementTree.write`. Returns a list of (optionally) encoded strings containing the XML data. It does not guarantee any specific sequence, except that ``b"".join(tostringlist(element)) == tostring(element)``. 
@@ -752,6 +755,9 @@ Functions .. versionchanged:: 3.4 Added the *short_empty_elements* parameter. + .. versionchanged:: next + Added the *validate* parameter. + .. versionchanged:: 3.8 Added the *xml_declaration* and *default_namespace* parameters. @@ -759,6 +765,9 @@ Functions The :func:`tostringlist` function now preserves the attribute order specified by the user. + .. versionchanged:: next + Added the *validate* parameter. + .. function:: XML(text, parser=None) @@ -1186,7 +1195,7 @@ ElementTree Objects .. method:: write(file, encoding="us-ascii", xml_declaration=None, \ default_namespace=None, method="xml", *, \ - short_empty_elements=True) + validate=False, short_empty_elements=True) Writes the element tree to a file, as XML. *file* is a file name, or a :term:`file object` opened for writing. *encoding* [1]_ is the output @@ -1197,6 +1206,14 @@ ElementTree Objects *default_namespace* sets the default XML namespace (for "xmlns"). *method* is either ``"xml"``, ``"html"`` or ``"text"`` (default is ``"xml"``). + + If *validate* is true, check that all characters are legal XML or HTML + characters, depending on *method*, element and attribute names are + valid, and the content of comments, processing instructions and + HTML elements like ``<script>`` do not contain illegal sequences, + and raise :exc:`ValueError` otherwise. + By default, no validation is performed. + The keyword-only *short_empty_elements* parameter controls the formatting of elements that contain no content. If ``True`` (the default), they are emitted as a single self-closed tag, otherwise they are emitted as a pair @@ -1216,6 +1233,9 @@ ElementTree Objects The :meth:`write` method now preserves the attribute order specified by the user. + .. versionchanged:: next + Added the *validate* parameter. 
+ This is the XML file that is going to be manipulated:: diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst index 9e2f789334ff02..3a711c1a2cfedc 100644 --- a/Doc/whatsnew/3.15.rst +++ b/Doc/whatsnew/3.15.rst @@ -1813,6 +1813,17 @@ xml (Contributed by Serhiy Storchaka in :gh:`139489`.) +xml.etree.ElementTree +--------------------- + +* Add the *validate* option to functions + :func:`~xml.etree.ElementTree.tostring`, + :func:`~xml.etree.ElementTree.tostringlist`, and the + :meth:`ElementTree.write <xml.etree.ElementTree.ElementTree.write>` method, + which allows validating the element or element tree before serialization. + (Contributed by Serhiy Storchaka in :gh:`149468`.) + + xml.parsers.expat ----------------- diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py index b820845f3b63e2..55b86769af128d 100644 --- a/Lib/test/test_xml_etree.py +++ b/Lib/test/test_xml_etree.py @@ -1358,6 +1358,192 @@ def test_attlist_default(self): {'{http://www.w3.org/XML/1998/namespace}lang': 'eng'}) +class XMLValidationTest(unittest.TestCase): + + def check(self, elem, expected=None): + self.assertRaises(ValueError, + ET.tostring, elem, validate=True) + ET.tostring(elem) # no exception + + def test_invalid_comment(self): + self.check(ET.Comment('a--b')) + self.check(ET.Comment(' B+, B, or B-')) + + def test_invalid_processing_instruction(self): + self.check(ET.PI('')) + self.check(ET.PI('0')) + self.check(ET.PI('a/b')) + self.check(ET.PI('foo\xa0bar')) + self.check(ET.PI('xml')) + self.check(ET.PI('xml', 'encoding="UTF-8"')) + self.check(ET.PI('foo', 'a?>b')) + self.check(ET.PI('foo', '\x00')) + self.check(ET.PI('foo', '\ud8ff')) + self.check(ET.PI('foo', '\ufffe')) + + def test_invalid_tag(self): + self.check(ET.Element('')) + self.check(ET.Element('0')) + self.check(ET.Element('a/b')) + self.check(ET.Element(ET.QName(''))) + self.check(ET.Element(ET.QName('0'))) + self.check(ET.Element(ET.QName('a/b'))) + + def test_invalid_attr_name(self): + 
self.check(ET.Element('tag', attrib={'': 'value'})) + self.check(ET.Element('tag', attrib={'0': 'value'})) + self.check(ET.Element('tag', attrib={'a/b': 'value'})) + self.check(ET.Element('tag', attrib={ET.QName(''): 'value'})) + self.check(ET.Element('tag', attrib={ET.QName('0'): 'value'})) + self.check(ET.Element('tag', attrib={ET.QName('a/b'): 'value'})) + + def test_invalid_attr_value(self): + self.check(ET.Element('tag', attrib={'key': '\x00'})) + self.check(ET.Element('tag', attrib={'key': '\ud8ff'})) + self.check(ET.Element('tag', attrib={'key': '\ufffe'})) + self.check(ET.Element('tag', attrib={'key': ET.QName('\x00')})) + self.check(ET.Element('tag', attrib={'key': ET.QName('\ud8ff')})) + self.check(ET.Element('tag', attrib={'key': ET.QName('\ufffe')})) + + def test_invalid_text(self): + elem = ET.Element('tag') + elem.text = '\x00' + self.check(elem) + elem.text = '\ud8ff' + self.check(elem) + elem.text = '\ufffe' + self.check(elem) + + def test_invalid_tail(self): + elem = ET.Element('tag') + elem.tail = '\x00' + self.check(elem) + elem.tail = '\ud8ff' + self.check(elem) + elem.tail = '\ufffe' + self.check(elem) + + def test_invalid_text_without_tag(self): + elem = ET.Element(None) + elem.text = '\x00' + self.check(elem) + elem.text = '\ud8ff' + self.check(elem) + elem.text = '\ufffe' + self.check(elem) + + def test_invalid_subelements(self): + elem = ET.Element('tag') + subelem = ET.SubElement(elem, 'subtag') + ET.SubElement(subelem, '\x00') + self.check(elem) + elem.tag = None + self.check(elem) + + def test_invalid_namespace_uri(self): + self.check(ET.Element('{\x00}tag')) + self.check(ET.Element('{\ud8ff}tag')) + self.check(ET.Element('{\ufffe}tag')) + self.check(ET.Element(ET.QName('\x00', 'tag'))) + self.check(ET.Element(ET.QName('\ud8ff', 'tag'))) + self.check(ET.Element(ET.QName('\ufffe', 'tag'))) + +class HTMLValidationTest(unittest.TestCase): + + def check(self, elem, expected=None): + self.assertRaises(ValueError, + ET.tostring, elem, 
method='html', validate=True) + ET.tostring(elem, method='html') # no exception + + def test_invalid_comment(self): + self.check(ET.Comment('>')) + self.check(ET.Comment('->')) + self.check(ET.Comment('a-->b')) + self.check(ET.Comment('a--!>b')) + self.check(ET.Comment('a\x00b')) + + def test_invalid_processing_instruction(self): + self.check(ET.PI('a>b')) + self.check(ET.PI('a\x00b')) + + def test_invalid_tag(self): + self.check(ET.Element('')) + self.check(ET.Element('?')) + self.check(ET.Element('!')) + self.check(ET.Element('0')) + self.check(ET.Element(' a')) + self.check(ET.Element('a b')) + self.check(ET.Element('a\nb')) + self.check(ET.Element('a/b')) + self.check(ET.Element('a>b')) + self.check(ET.Element('a\x00b')) + self.check(ET.Element(ET.QName(''))) + self.check(ET.Element(ET.QName('0'))) + self.check(ET.Element(ET.QName('a/b'))) + + def test_invalid_attr_name(self): + self.check(ET.Element('tag', attrib={'': 'value'})) + self.check(ET.Element('tag', attrib={'a/b': 'value'})) + self.check(ET.Element('tag', attrib={'a=b': 'value'})) + self.check(ET.Element('tag', attrib={ET.QName(''): 'value'})) + self.check(ET.Element('tag', attrib={ET.QName('a/b'): 'value'})) + + def test_invalid_attr_value(self): + self.check(ET.Element('tag', attrib={'key': '\x00'})) + self.check(ET.Element('tag', attrib={'key': ET.QName('\x00')})) + self.check(ET.Element('tag', attrib={'key': ET.QName('a"b')})) + self.check(ET.Element('tag', attrib={'key': ET.QName('a&b')})) + + def test_invalid_text(self): + elem = ET.Element('tag') + elem.text = '\x00' + self.check(elem) + + def test_invalid_tail(self): + elem = ET.Element('tag') + elem.tail = '\x00' + self.check(elem) + + def test_invalid_text_without_tag(self): + elem = ET.Element(None) + elem.text = '\x00' + self.check(elem) + + def test_invalid_subelements(self): + elem = ET.Element('tag') + subelem = ET.SubElement(elem, 'subtag') + ET.SubElement(subelem, '\x00') + self.check(elem) + elem.tag = None + self.check(elem) + + 
def test_invalid_namespace_uri(self): + self.check(ET.Element('{\x00}tag')) + self.check(ET.Element(ET.QName('\x00', 'tag'))) + + @support.subTests('tag', ("script", "style", "xmp", "iframe", "noembed", "noframes")) + def test_invalid_cdata_content(self, tag): + elem = ET.Element(tag.upper()) + elem.text = 'a</%s>b' % tag.title() + self.check(elem) + elem.text = 'a</%s b' % tag.title() + self.check(elem) + elem.text = 'a</%s/b' % tag.title() + self.check(elem) + elem.text = 'a\x00b' + self.check(elem) + + @support.subTests('tag', ("script", "style", "xmp", "iframe", "noembed", "noframes")) + def test_cdata_subelements(self, tag): + elem = ET.Element(tag) + ET.SubElement(elem, 'subtag') + self.check(elem) + + def test_invalid_plaintext_content(self): + elem = ET.Element('plaintext') + elem.text = 'a\x00b' + self.check(elem) + class IterparseTest(unittest.TestCase): # Test iterparse interface. diff --git a/Lib/xml/etree/ElementTree.py b/Lib/xml/etree/ElementTree.py index 7b14ec360d7cf7..6faf348aacf01b 100644 --- a/Lib/xml/etree/ElementTree.py +++ b/Lib/xml/etree/ElementTree.py @@ -99,6 +99,7 @@ import weakref from . import ElementPath +from .. import is_valid_name, is_valid_text class ParseError(SyntaxError): @@ -689,6 +690,7 @@ def write(self, file_or_filename, xml_declaration=None, default_namespace=None, method=None, *, + validate=False, short_empty_elements=True): """Write element tree to a file as XML. @@ -706,6 +708,8 @@ def write(self, file_or_filename, *method* -- either "xml" (default), "html, "text", or "c14n" + *validate* -- if true, validate the content + *short_empty_elements* -- controls the formatting of elements that contain no content. 
If True (default) they are emitted as a single self-closed @@ -737,6 +741,7 @@ def write(self, file_or_filename, qnames, namespaces = _namespaces(self._root, default_namespace) serialize = _serialize[method] serialize(write, self._root, qnames, namespaces, + validate=validate, short_empty_elements=short_empty_elements) def write_c14n(self, file): @@ -857,23 +862,39 @@ def add_qname(qname): add_qname(text.text) return qnames, namespaces -def _serialize_xml(write, elem, qnames, namespaces, - short_empty_elements, **kwargs): +def _serialize_xml(write, elem, qnames, namespaces, *, + validate, short_empty_elements, **kwargs): tag = elem.tag text = elem.text if tag is Comment: + if validate: + if '--' in text or text.endswith('-'): + raise ValueError('invalid comment') write("<!--%s-->" % text) elif tag is ProcessingInstruction: + if validate: + m = re.search('[ \t\r\n]', text) + if m is not None: + target = text[:m.start()] + else: + target = text + if (not is_valid_name(target) or target.lower() == 'xml' + or '?>' in text or not is_valid_text(text)): + raise ValueError('invalid processing instruction') write("<?%s?>" % text) else: tag = qnames[tag] if tag is None: if text: - write(_escape_cdata(text)) + write(_escape_cdata(text, validate)) for e in elem: _serialize_xml(write, e, qnames, None, + validate=validate, short_empty_elements=short_empty_elements) else: + if validate: + if not is_valid_name(tag): + raise ValueError('invalid element name') write("<" + tag) items = list(elem.items()) if items or namespaces: @@ -882,30 +903,40 @@ def _serialize_xml(write, elem, qnames, namespaces, key=lambda x: x[1]): # sort on prefix if k: k = ":" + k + if validate: + if not is_valid_name(k): + raise ValueError('invalid namespace name') write(" xmlns%s=\"%s\"" % ( k, - _escape_attrib(v) + _escape_attrib(v, validate) )) for k, v in items: if isinstance(k, QName): k = k.text + if validate: + if not is_valid_name(qnames[k]): + raise ValueError('invalid attribute name') if 
isinstance(v, QName): v = qnames[v.text] + if validate: + if not is_valid_name(v): + raise ValueError('invalid attribute value') else: - v = _escape_attrib(v) + v = _escape_attrib(v, validate) write(" %s=\"%s\"" % (qnames[k], v)) if text or len(elem) or not short_empty_elements: write(">") if text: - write(_escape_cdata(text)) + write(_escape_cdata(text, validate)) for e in elem: _serialize_xml(write, e, qnames, None, + validate=validate, short_empty_elements=short_empty_elements) write("</" + tag + ">") else: write(" />") if elem.tail: - write(_escape_cdata(elem.tail)) + write(_escape_cdata(elem.tail, validate)) _CDATA_CONTENT_ELEMENTS = {"script", "style", "xmp", "iframe", "noembed", "noframes", "plaintext"} @@ -914,21 +945,34 @@ def _serialize_xml(write, elem, qnames, namespaces, "img", "input", "isindex", "link", "meta", "param", "source", "track", "wbr", "plaintext"} -def _serialize_html(write, elem, qnames, namespaces, **kwargs): +def _serialize_html(write, elem, qnames, namespaces, *, validate=True, **kwargs): tag = elem.tag text = elem.text if tag is Comment: + if validate: + if (re.prefixmatch('-?>', text) or re.search('--!?>', text) + or '\0' in text): + raise ValueError('invalid comment') write("<!--%s-->" % text) elif tag is ProcessingInstruction: + if validate: + if '>' in text or '\0' in text: + raise ValueError('invalid processing instruction') write("<?%s?>" % text) else: tag = qnames[tag] if tag is None: if text: + if validate: + if '\0' in text: + raise ValueError('invalid characters') write(_escape_cdata(text)) for e in elem: - _serialize_html(write, e, qnames, None) + _serialize_html(write, e, qnames, None, validate=validate) else: + if validate: + if not re.fullmatch('[A-Za-z][^\0\t\n\r\f />]*+', tag): + raise ValueError('invalid element name') write("<" + tag) items = list(elem.items()) if items or namespaces: @@ -937,6 +981,12 @@ def _serialize_html(write, elem, qnames, namespaces, **kwargs): key=lambda x: x[1]): # sort on prefix if k: k = 
":" + k + if validate: + if not re.fullmatch('[^\0\t\n\r\f />=]++', k): + raise ValueError('invalid attribute name') + if validate: + if '\0' in v: + raise ValueError('invalid characters') write(" xmlns%s=\"%s\"" % ( k, _escape_attrib(v) @@ -945,26 +995,49 @@ def _serialize_html(write, elem, qnames, namespaces, **kwargs): if isinstance(k, QName): k = k.text k = qnames[k] + if validate: + if not re.fullmatch('[^\0\t\n\r\f />][^\0\t\n\r\f />=]*+', k): + raise ValueError('invalid attribute name') if v is None: - write(" %s" % k) + write(" %s" % (k,)) else: if isinstance(v, QName): v = qnames[v.text] + if validate: + if '\0' in v or '"' in v or '&' in v: + raise ValueError('invalid attribute value') else: + if validate: + if '\0' in v: + raise ValueError('invalid attribute value') v = _escape_attrib_html(v) write(" %s=\"%s\"" % (k, v)) write(">") ltag = tag.lower() if text: + if validate: + if '\0' in text: + raise ValueError('invalid characters') if ltag in _CDATA_CONTENT_ELEMENTS: + if validate: + if (ltag != "plaintext" + and re.search(r'</%s(?=[\t\n\r\f />])' % ltag, + text, re.IGNORECASE|re.ASCII)): + raise ValueError('invalid %s content' % ltag) write(text) else: write(_escape_cdata(text)) + if validate: + if ltag in _CDATA_CONTENT_ELEMENTS and len(elem): + raise ValueError('subelements in %s element' % ltag) for e in elem: - _serialize_html(write, e, qnames, None) + _serialize_html(write, e, qnames, None, validate=validate) if ltag not in HTML_EMPTY: write("</" + tag + ">") if elem.tail: + if validate: + if '\0' in elem.tail: + raise ValueError('invalid characters') write(_escape_cdata(elem.tail)) def _serialize_text(write, elem): @@ -1021,9 +1094,12 @@ def _raise_serialization_error(text): "cannot serialize %r (type %s)" % (text, type(text).__name__) ) -def _escape_cdata(text): +def _escape_cdata(text, validate=False): # escape character data try: + if validate: + if not is_valid_text(text): + raise ValueError('invalid characters') # it's worth avoiding 
do-nothing calls for strings that are # shorter than 500 characters, or so. assume that's, by far, # the most common case in most applications. @@ -1037,9 +1113,12 @@ def _escape_cdata(text): except (TypeError, AttributeError): _raise_serialization_error(text) -def _escape_attrib(text): +def _escape_attrib(text, validate=False): # escape attribute value try: + if validate: + if not is_valid_text(text): + raise ValueError('invalid attribute value') if "&" in text: text = text.replace("&", "&amp;") if "<" in text: @@ -1082,7 +1161,7 @@ def _escape_attrib_html(text): def tostring(element, encoding=None, method=None, *, xml_declaration=None, default_namespace=None, - short_empty_elements=True): + validate=False, short_empty_elements=True): """Generate string representation of XML element. All subelements are included. If encoding is "unicode", a string @@ -1101,6 +1180,7 @@ def tostring(element, encoding=None, method=None, *, xml_declaration=xml_declaration, default_namespace=default_namespace, method=method, + validate=validate, short_empty_elements=short_empty_elements) return stream.getvalue() @@ -1123,13 +1203,14 @@ def tell(self): def tostringlist(element, encoding=None, method=None, *, xml_declaration=None, default_namespace=None, - short_empty_elements=True): + validate=False, short_empty_elements=True): lst = [] stream = _ListDataStream(lst) ElementTree(element).write(stream, encoding, xml_declaration=xml_declaration, default_namespace=default_namespace, method=method, + validate=validate, short_empty_elements=short_empty_elements) return lst diff --git a/Misc/NEWS.d/next/Library/2026-05-06-22-22-05.gh-issue-149468.IUSCzU.rst b/Misc/NEWS.d/next/Library/2026-05-06-22-22-05.gh-issue-149468.IUSCzU.rst new file mode 100644 index 00000000000000..a4313cac07eea5 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-05-06-22-22-05.gh-issue-149468.IUSCzU.rst @@ -0,0 +1,3 @@ +Add the *validate* option to :mod:`xml.etree.ElementTree` serialization +functions, which allows to 
validate the element or element tree before +serialization.