From 44cba6b4b8c45879d60442cee859cd2e9c408b27 Mon Sep 17 00:00:00 2001 From: ND Date: Sat, 27 Jun 2026 11:26:40 +0800 Subject: [PATCH] feat: make patch_xml respect custom Jinja2 delimiters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit patch_xml() had hardcoded regex patterns for {{ }}, {% %}, and {# #}, ignoring any custom delimiters set via jinja_env. This caused template variables to be silently left unreplaced when using non-default delimiters (like single braces { }) because XML tags split by Word were never stripped from inside the custom blocks. Changes: - Added jinja_env parameter to patch_xml() and all its call sites - Dynamic regex patterns for stripping XML tags inside Jinja2 blocks (pattern ②) - Dynamic regex patterns for HTML entity cleanup inside Jinja2 tags (pattern ⑥) - Default behavior unchanged when jinja_env is None - Added test with intentionally split XML runs and custom { } delimiters Co-Authored-By: Claude --- docxtpl/template.py | 79 ++++++++++++----- tests/custom_delimiters.py | 98 +++++++++++++++++++++ tests/templates/custom_delimiters_tpl.docx | Bin 0 -> 977 bytes 3 files changed, 155 insertions(+), 22 deletions(-) create mode 100644 tests/custom_delimiters.py create mode 100644 tests/templates/custom_delimiters_tpl.docx diff --git a/docxtpl/template.py b/docxtpl/template.py index f20280a..0a66d97 100644 --- a/docxtpl/template.py +++ b/docxtpl/template.py @@ -82,11 +82,39 @@ def write_xml(self, filename): with open(filename, "w") as fh: fh.write(self.get_xml()) - def patch_xml(self, src_xml): + @staticmethod + def _get_delim_repr(env, attr, default): + """Return the regex-escaped representation of a jinja2 delimiter.""" + if env is None: + return re.escape(default) + return re.escape(getattr(env, attr)) + + def patch_xml(self, src_xml, jinja_env=None): """Make a lots of cleaning to have a raw xml understandable by jinja2 : strip all unnecessary xml tags, manage table cell background color and colspan, unescape html entities, etc...""" + # Resolve delimiter strings (regex-escaped) for dynamic patterns. + # When jinja_env is None, defaults to standard Jinja2 delimiters. + vo = self._get_delim_repr(jinja_env, "variable_start_string", "{{") + vc = self._get_delim_repr(jinja_env, "variable_end_string", "}}") + bo = self._get_delim_repr(jinja_env, "block_start_string", "{%") + bc = self._get_delim_repr(jinja_env, "block_end_string", "%}") + co = self._get_delim_repr(jinja_env, "comment_start_string", "{#") + cc = self._get_delim_repr(jinja_env, "comment_end_string", "#}") + + # Build a union pattern matching any Jinja2 tag: + # block_start ... block_end | comment_start ... comment_end | variable_start ... variable_end + def _tag_union(): + parts = [] + if bo and bc: + parts.append(f"{bo}(?:(?!{bc}).)*") + if co and cc: + parts.append(f"{co}(?:(?!{cc}).)*") + if vo and vc: + parts.append(f"{vo}(?:(?!{vc}).)*") + return "|".join(parts) + # replace {{ by {{ ( works with {{ }} {% and %} {# and #}) src_xml = re.sub( r"(?<={)(<[^>]*>)+(?=[\{%\#])|(?<=[%\}\#])(<[^>]*>)+(?=\})", @@ -103,12 +131,14 @@ def striptags(m): ".*?(|]*>)", "", m.group(0), flags=re.DOTALL ) - src_xml = re.sub( - r"{%(?:(?!%}).)*|{#(?:(?!#}).)*|{{(?:(?!}}).)*", - striptags, - src_xml, - flags=re.DOTALL, - ) + tag_pat = _tag_union() + if tag_pat: + src_xml = re.sub( + tag_pat, + striptags, + src_xml, + flags=re.DOTALL, + ) # manage table cell colspan def colspan(m): @@ -286,19 +316,23 @@ def without_gridspan(m2): flags=re.DOTALL, ) - def clean_tags(m): + def _clean_inner(text): return ( - m.group(0) - .replace(r"‘", "'") - .replace("<", "<") - .replace(">", ">") - .replace("“", '"') - .replace("”", '"') - .replace("‘", "'") - .replace("’", "'") + text.replace("‘", "'").replace("<", "<").replace(">", ">").replace("“", '"').replace("”", '"').replace("‘", "'").replace("’", "'") ) - src_xml = re.sub(r"(?<=\{[\{%])(.*?)(?=[\}%]})", clean_tags, src_xml) + # Build a dynamic pattern to match content *inside* any Jinja2 tag + # (between start and end delimiters) and apply HTML entity cleanup. + # Uses capture groups to preserve delimiter boundaries since + # lookbehind/lookahead widths can vary with custom delimiters. + clean_start = f"({vo}|{bo}|{co})" + clean_end = f"({vc}|{bc}|{cc})" + src_xml = re.sub( + clean_start + r"(.*?)" + clean_end, + lambda m: m.group(1) + _clean_inner(m.group(2)) + m.group(3), + src_xml, + flags=re.DOTALL, + ) return src_xml @@ -372,7 +406,8 @@ def render_footnotes( xml = self.patch_xml( part.blob.decode("utf-8") if isinstance(part.blob, bytes) - else part.blob + else part.blob, + jinja_env, ) xml = self.render_xml_part(xml, part, context, jinja_env) part._blob = xml.encode("utf-8") @@ -432,7 +467,7 @@ def resolve_paragraph(m): def build_xml(self, context, jinja_env=None): xml = self.get_xml() - xml = self.patch_xml(xml) + xml = self.patch_xml(xml, jinja_env) xml = self.render_xml_part(xml, self.docx._part, context, jinja_env) return xml @@ -459,7 +494,7 @@ def build_headers_footers_xml(self, context, uri, jinja_env=None): for relKey, part in self.get_headers_footers(uri): xml = self.get_part_xml(part) encoding = self.get_headers_footers_encoding(xml) - xml = self.patch_xml(xml) + xml = self.patch_xml(xml, jinja_env) xml = self.render_xml_part(xml, part, context, jinja_env) yield relKey, xml.encode(encoding) @@ -901,14 +936,14 @@ def get_undeclared_template_variables( # Get XML from the temporary document xml = self.xml_to_string(temp_doc._element.body) - xml = self.patch_xml(xml) + xml = self.patch_xml(xml, jinja_env) # Add headers and footers for uri in [self.HEADER_URI, self.FOOTER_URI]: for relKey, val in temp_doc._part.rels.items(): if (val.reltype == uri) and (val.target_part.blob): _xml = self.xml_to_string(parse_xml(val.target_part.blob)) - xml += self.patch_xml(_xml) + xml += self.patch_xml(_xml, jinja_env) if jinja_env: env = jinja_env diff --git a/tests/custom_delimiters.py b/tests/custom_delimiters.py new file mode 100644 index 0000000..dbbd621 --- /dev/null +++ b/tests/custom_delimiters.py @@ -0,0 +1,98 @@ +"""Test that custom Jinja2 delimiters work with patch_xml. + +This verifies that patch_xml properly strips XML tags from inside +user-configured Jinja2 blocks when using non-default delimiters +(like single braces {} instead of double braces {{}}). +""" +import sys, os +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +from docxtpl import DocxTemplate +from jinja2 import Environment +import zipfile, re + +TEMPLATE = os.path.join(os.path.dirname(__file__), + "templates", "custom_delimiters_tpl.docx") +OUTPUT = os.path.join(os.path.dirname(__file__), + "output", "custom_delimiters.docx") + + +def _get_text_from_docx(path): + """Extract plain text from a docx file for assertion.""" + with zipfile.ZipFile(path, "r") as z: + xml = z.read("word/document.xml").decode("utf-8") + return re.sub(r"<[^>]+>", "", xml) + + +def test_custom_delimiters(): + """Custom { } delimiters should render correctly even when + variables are split across multiple XML runs.""" + tpl = DocxTemplate(TEMPLATE) + jinja_env = Environment( + variable_start_string="{", + variable_end_string="}", + ) + tpl.render({"name": "Alice", "score": "95"}, jinja_env) + tpl.save(OUTPUT) + + text = _get_text_from_docx(OUTPUT) + print("Rendered text:", repr(text)) + + # Both variables should be substituted + assert "{name}" not in text, "Variable {name} was not rendered!" + assert "{score}" not in text, "Variable {score} was not rendered!" + assert "Alice" in text, "Name should appear in output" + assert "95" in text, "Score should appear in output" + + # No leftover braces + assert "{" not in text, "Leftover { in output" + assert "}" not in text, "Leftover } in output" + + print("✅ custom_delimiters: PASS") + + +def test_default_delimiters_still_work(): + """Default {{ }} delimiters should still work (backward compat).""" + import io as _io + + # Create a template with default delimiters + default_xml = """ + + + + Hello {{ + name + }}! + + +""" + buf = _io.BytesIO() + with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf: + zf.writestr("[Content_Types].xml", """ + + + + +""") + zf.writestr("_rels/.rels", """ + + +""") + zf.writestr("word/document.xml", default_xml) + + out_default = os.path.join(os.path.dirname(__file__), + "output", "custom_delimiters_default.docx") + tpl = DocxTemplate(buf) + tpl.render({"name": "Bob"}) + tpl.save(out_default) + + text = _get_text_from_docx(out_default) + print("Rendered text (default):", repr(text)) + assert "Bob" in text, "Name should appear in output" + assert "{{" not in text, "Leftover {{ in output" + print("✅ default_delimiters: PASS") + + +if __name__ == "__main__": + test_custom_delimiters() + test_default_delimiters_still_work() diff --git a/tests/templates/custom_delimiters_tpl.docx b/tests/templates/custom_delimiters_tpl.docx new file mode 100644 index 0000000000000000000000000000000000000000..32469104deb27db139e2cce370c749ffef9a2542 GIT binary patch literal 977 zcmWIWW@Zs#U|`^2xEOUi=6YqztB*k54n_tBVIUptoS#>cnpYAZQdy8%9IIE6n^QaC ztlwb+ftL5RT&J&V6`o>LoU(+yuJ6D?n**orsf7JiUmtyId-`Rki2_eE&ej;e&b-e4 z{)n9MO$n==tE4pzUS#A*SpVEtx$fPP1xyDUx*4r2T_(M*yO`I<>1iOdVsfzN%Q($T z!c_~b)OgnL7*=ciIkVVn!FGd1yE11*FHP2v@Ctp#b5p-1L%y(KuJ)ss2TuK**3-A7 z!lL^5EXYiMsDA&d`_aL+_wid^nbiE}3_uAX1?jd| zn}8vs2Mi%@ARS+nnp3Q=2O@jV@8)YV5Mg^TcX`G+Jy!N)p4Ou?Hb~VpEVG^BR^=J3 zzh|$k%n=KVdAG|yncuy9=1HGoR=VVsR92R?1wqkA9~jLst5z^9)c*NK$0)+U>AKii z$@F=jeCKTGS5Y=Ini2Gg-D~lT3m+n^`px{^#CiO8@^n?{t&35-8+Ns?&7b*J(*#lP zs*?&|HnAD$$o+qK*CMOYT|JL2_t(d#*~+JFG<*hWNRIlCb zi_5wO^inAk1A`!tF3&GY(ND=wF3km|08rF#owU*KhyhRA_i8R}W0O|Kg|BAa7Eork zaqu`3yY$GqM>0#D_E>vz3p2ke=>BW@FZuMQ@9q5(4`e6v`0U-48O7?+_t)Wg++G{| znEToja!%ep(6V<{;FW`Kz6C$OIafKTEvRwH?5fgJm2b9)l^@Dj`cQD?lY17fv%P2Z zRXuug{0N_`SF^;I+}rnFX0QjPu?MW$w`sn%-oYK04xJ4795UzM$KdH_mD>&k%>Oj& z$M@sU#SCwDM1P5oJ`!&5*|O2XkE1yzuWRYyqsKD-b!h9YUcEtY%cb`RXXFQXGct)V zClQEt28ISkC7=;V2?kvwdITUe{sn5ph>ienRyL3lW+2=N Jq?wpOJOC`MjaUEx literal 0 HcmV?d00001