From 6bc21cabb14ec74424a8b6ff82c84f106723ceca Mon Sep 17 00:00:00 2001 From: HaydenMichel8 Date: Wed, 24 Jun 2026 15:28:22 -0400 Subject: [PATCH 1/5] Add support for parsing digraphs, <% as a replacement for { and <: as a replacement for [ --- cxxheaderparser/lexer.py | 54 +++++++++- tests/test_digraphs.py | 213 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 266 insertions(+), 1 deletion(-) create mode 100644 tests/test_digraphs.py diff --git a/cxxheaderparser/lexer.py b/cxxheaderparser/lexer.py index ff071af..05209ef 100644 --- a/cxxheaderparser/lexer.py +++ b/cxxheaderparser/lexer.py @@ -665,8 +665,33 @@ def return_tokens(self, toks: typing.Sequence[LexToken]) -> None: class LexerTokenStream(TokenStream): """ Provides tokens from using PlyLexer on the given input text + + This class also handles C++ digraphs (ISO C++ §2.6 [lex.digraph]), which + are two-character alternative representations for certain tokens:: + + <% -> { + %> -> } + <: -> [ + :> -> ] + %: -> # (preprocessor; rejected later as unsupported) + + Digraph recognition happens here, after the PLY tokenizer emits individual + characters, so the rest of the parser sees only the canonical tokens. """ + # Maps (first_token_type, second_token_value) -> replacement_token_type + # Only two-token digraph pairs that the PLY lexer will split are listed. + # The PLY lexer emits '<', '%', and ':' as literal tokens (single chars), + # so each digraph arrives as two consecutive tokens. + _digraph_map: typing.Dict[typing.Tuple[str, str], str] = { + ("%", ">"): "}", # %> -> } + ("<", "%"): "{", # <% -> { + ("<", ":"): "[", # <: -> [ + (":", ">"): "]", # :> -> ] + # %: -> # would produce a PP_DIRECTIVE; we leave it for the existing + # preprocessor-directive error path rather than silently mangling it. + } + _user_defined_literal_start = { "FLOAT_CONST", "HEX_FLOAT_CONST", @@ -702,9 +727,36 @@ def _fill_tokbuf(self, tokbuf: typing.Deque[LexToken]) -> bool: return False udl_start = self._user_defined_literal_start + digraph_map = self._digraph_map while True: tok.location = self._lex.current_location() + + # Detect C++ digraphs: two consecutive literal tokens that together + # form an alternative representation of a single token (ISO C++ §2.6). + # We peek at the next token; if the pair is a known digraph we merge + # them into the canonical single-character token *before* appending + # to the buffer, so the rest of the parser never sees the raw digraph. + if tok.type in ("<", "%", ":"): + tok2 = get_token() + if tok2 is not None: + replacement = digraph_map.get((tok.type, tok2.value)) + if replacement is not None: + # Reuse tok, replace its type/value with the canonical token. + tok.type = replacement + tok.value = replacement + tokbuf.append(tok) + tok = get_token() + if tok is None: + break + continue + else: + # Not a digraph — process tok normally and re-queue tok2. + tokbuf.append(tok) + tok = tok2 + continue + # tok2 is None (EOF): fall through to append tok and return. + tokbuf.append(tok) if tok.type == "NEWLINE": @@ -867,4 +919,4 @@ def get_doxygen_after(self) -> typing.Optional[str]: try: lex.runmain(lexer=PlyLexer(None)) except EOFError: - pass + pass \ No newline at end of file diff --git a/tests/test_digraphs.py b/tests/test_digraphs.py new file mode 100644 index 0000000..a6373d4 --- /dev/null +++ b/tests/test_digraphs.py @@ -0,0 +1,213 @@ +from cxxheaderparser.simple import parse_string, ParsedData +from cxxheaderparser.types import ( + FundamentalSpecifier, + NameSpecifier, + PQName, + Type, + Variable, + Function, + FunctionType, + Parameter, + Array, +) + + +def test_digraph_brace_open_close_function(): + """<% %> should work as { } in a function body context (body is skipped).""" + # The parser skips function bodies but must recognise the braces. + content = """\ +#include +int main() +<% + std::cout << "Hello, World!" << std::endl; + return 0; +%> +""" + # parse_string should not raise + result = parse_string(content) + assert isinstance(result, ParsedData) + + +def test_digraph_struct_body(): + """<% %> should work as { } around a struct body.""" + content = """\ +struct Point +<% + int x; + int y; +%>; +""" + result = parse_string(content) + assert len(result.namespace.classes) == 1 + cls = result.namespace.classes[0] + assert cls.class_decl.typename.segments[-1].name == "Point" + field_names = [f.name for f in cls.fields] + assert field_names == ["x", "y"] + + +def test_digraph_namespace_body(): + """<% %> should work as { } around a namespace body.""" + content = """\ +namespace myns +<% + int value; +%> +""" + result = parse_string(content) + assert "myns" in result.namespace.namespaces + ns = result.namespace.namespaces["myns"] + assert len(ns.variables) == 1 + assert ns.variables[0].name.segments[-1].name == "value" + + +def test_digraph_nested_braces(): + """Nested digraph brace pairs should work correctly.""" + content = """\ +namespace outer +<% + struct Inner + <% + int val; + %>; +%> +""" + result = parse_string(content) + assert "outer" in result.namespace.namespaces + ns = result.namespace.namespaces["outer"] + assert len(ns.classes) == 1 + inner = ns.classes[0] + assert inner.class_decl.typename.segments[-1].name == "Inner" + assert inner.fields[0].name == "val" + + +def test_digraph_mixed_braces(): + """Digraph and canonical braces can be mixed freely.""" + content = """\ +namespace ns +<% + struct Foo { + int a; + }; +%> +""" + result = parse_string(content) + assert "ns" in result.namespace.namespaces + ns = result.namespace.namespaces["ns"] + assert len(ns.classes) == 1 + + +def test_digraph_array_subscript(): + """<: :> should work as [ ] in an array declaration.""" + content = """\ +int arr<:10:>; +""" + result = parse_string(content) + assert len(result.namespace.variables) == 1 + var = result.namespace.variables[0] + assert var.name.segments[-1].name == "arr" + # The type should be an array of 10 ints + assert isinstance(var.type, Array) + + +def test_digraph_array_and_brace(): + """Both digraph pairs used together.""" + content = """\ +struct Grid +<% + float data<:4:>; +%>; +""" + result = parse_string(content) + cls = result.namespace.classes[0] + assert cls.class_decl.typename.segments[-1].name == "Grid" + field = cls.fields[0] + assert field.name == "data" + assert isinstance(field.type, Array) + + +def test_canonical_tokens_unaffected(): + """Normal { } [ ] tokens must continue to work after digraph support.""" + content = """\ +namespace ns { + struct Foo { + int arr[5]; + }; +} +""" + result = parse_string(content) + assert "ns" in result.namespace.namespaces + ns = result.namespace.namespaces["ns"] + cls = ns.classes[0] + assert cls.fields[0].name == "arr" + + +def test_template_angle_brackets_unaffected(): + """< > used as template angle brackets must NOT be treated as digraphs.""" + content = """\ +template +struct Container +{ + T data[N]; +}; +""" + result = parse_string(content) + cls = result.namespace.classes[0] + assert cls.class_decl.typename.segments[-1].name == "Container" + assert len(cls.class_decl.template.params) == 2 + + +def test_shift_left_unaffected(): + """The << operator (SHIFT_LEFT token) must not be affected by digraph detection.""" + content = """\ +template +void fn(T x); +""" + result = parse_string(content) + assert len(result.namespace.functions) == 1 + assert result.namespace.functions[0].name.segments[-1].name == "fn" + + +def test_percent_operator_unaffected(): + """A bare % in an expression context must not be altered.""" + # The parser skips default parameter expressions, so we embed % there. + content = """\ +void fn(int x = 10 % 3); +""" + result = parse_string(content) + assert result.namespace.functions[0].name.segments[-1].name == "fn" + + +def test_digraph_enum(): + """<% %> should work as { } in an enum definition.""" + content = """\ +enum Color +<% + Red, + Green, + Blue +%>; +""" + result = parse_string(content) + assert len(result.namespace.enums) == 1 + en = result.namespace.enums[0] + values = [v.name for v in en.values] + assert values == ["Red", "Green", "Blue"] + + +def test_digraph_class_with_methods(): + """<% %> braces work for a class with member function declarations.""" + content = """\ +class MyClass +<% +public: + MyClass(); + ~MyClass(); + int getValue() const; +%>; +""" + result = parse_string(content) + cls = result.namespace.classes[0] + assert cls.class_decl.typename.segments[-1].name == "MyClass" + method_names = [m.name.segments[-1].name for m in cls.methods] + assert "MyClass" in method_names + assert "getValue" in method_names \ No newline at end of file From fe99770edd1773a2dc0761ce315bf15cb6c67264 Mon Sep 17 00:00:00 2001 From: Hayden Michel Date: Wed, 24 Jun 2026 15:49:24 -0400 Subject: [PATCH 2/5] Add newline at end of file in lexer.py --- cxxheaderparser/lexer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cxxheaderparser/lexer.py b/cxxheaderparser/lexer.py index 05209ef..863a5bd 100644 --- a/cxxheaderparser/lexer.py +++ b/cxxheaderparser/lexer.py @@ -919,4 +919,5 @@ def get_doxygen_after(self) -> typing.Optional[str]: try: lex.runmain(lexer=PlyLexer(None)) except EOFError: - pass \ No newline at end of file + pass + From 638b5ecf37e604f8d5653585a0c621a75cca5aca Mon Sep 17 00:00:00 2001 From: Hayden Michel Date: Wed, 24 Jun 2026 15:49:59 -0400 Subject: [PATCH 3/5] Add new line at the end of test_digraphs.py --- tests/test_digraphs.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_digraphs.py b/tests/test_digraphs.py index a6373d4..cfd7890 100644 --- a/tests/test_digraphs.py +++ b/tests/test_digraphs.py @@ -210,4 +210,5 @@ class MyClass assert cls.class_decl.typename.segments[-1].name == "MyClass" method_names = [m.name.segments[-1].name for m in cls.methods] assert "MyClass" in method_names - assert "getValue" in method_names \ No newline at end of file + assert "getValue" in method_names + From 09b3cf927caea860084b9c7b1239c43b9ef29797 Mon Sep 17 00:00:00 2001 From: Hayden Michel Date: Wed, 24 Jun 2026 15:58:14 -0400 Subject: [PATCH 4/5] Remove unnecessary blank line in lexer.py --- cxxheaderparser/lexer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cxxheaderparser/lexer.py b/cxxheaderparser/lexer.py index 863a5bd..8c69eee 100644 --- a/cxxheaderparser/lexer.py +++ b/cxxheaderparser/lexer.py @@ -920,4 +920,3 @@ def get_doxygen_after(self) -> typing.Optional[str]: lex.runmain(lexer=PlyLexer(None)) except EOFError: pass - From 490ba835968723cd1538a95bed62fcb7e8ff052e Mon Sep 17 00:00:00 2001 From: Hayden Michel Date: Wed, 24 Jun 2026 15:58:44 -0400 Subject: [PATCH 5/5] Remove trailing whitespace in test_digraphs.py --- tests/test_digraphs.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_digraphs.py b/tests/test_digraphs.py index cfd7890..489f32e 100644 --- a/tests/test_digraphs.py +++ b/tests/test_digraphs.py @@ -211,4 +211,3 @@ class MyClass method_names = [m.name.segments[-1].name for m in cls.methods] assert "MyClass" in method_names assert "getValue" in method_names -