From 6bc21cabb14ec74424a8b6ff82c84f106723ceca Mon Sep 17 00:00:00 2001
From: HaydenMichel8 <haydenmichel8@gmail.com>
Date: Wed, 24 Jun 2026 15:28:22 -0400
Subject: [PATCH 1/5] Add support for parsing digraphs: <% %> as replacements
 for { } and <: :> as replacements for [ ]

---
 cxxheaderparser/lexer.py |  54 +++++++++-
 tests/test_digraphs.py   | 213 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 266 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_digraphs.py

diff --git a/cxxheaderparser/lexer.py b/cxxheaderparser/lexer.py
index ff071af..05209ef 100644
--- a/cxxheaderparser/lexer.py
+++ b/cxxheaderparser/lexer.py
@@ -665,8 +665,33 @@ def return_tokens(self, toks: typing.Sequence[LexToken]) -> None:
 class LexerTokenStream(TokenStream):
     """
     Provides tokens from using PlyLexer on the given input text
+
+    This class also handles C++ digraphs (ISO C++ §2.6 [lex.digraph]), which
+    are two-character alternative representations for certain tokens::
+
+        <%   ->  {
+        %>   ->  }
+        <:   ->  [
+        :>   ->  ]
+        %:   ->  #  (not translated here; '%' and ':' stay separate tokens)
+
+    Digraph recognition happens here, after the PLY tokenizer emits individual
+    characters, so the rest of the parser sees only the canonical tokens.
     """
 
+    # Maps (first_token_type, second_token_value) -> replacement_token_type
+    # Only two-token digraph pairs that the PLY lexer will split are listed.
+    # The PLY lexer emits '<', '%', and ':' as literal tokens (single chars),
+    # so each digraph arrives as two consecutive tokens.
+    _digraph_map: typing.Dict[typing.Tuple[str, str], str] = {
+        ("%", ">"): "}",  # %> -> }
+        ("<", "%"): "{",  # <% -> {
+        ("<", ":"): "[",  # <: -> [
+        (":", ">"): "]",  # :> -> ]
+        # %: (the digraph for #) is deliberately not listed: '%' and ':' are
+        # emitted as two separate tokens instead of being merged into one.
+    }
+
     _user_defined_literal_start = {
         "FLOAT_CONST",
         "HEX_FLOAT_CONST",
@@ -702,9 +727,36 @@ def _fill_tokbuf(self, tokbuf: typing.Deque[LexToken]) -> bool:
             return False
 
         udl_start = self._user_defined_literal_start
+        digraph_map = self._digraph_map
 
         while True:
             tok.location = self._lex.current_location()
+
+            # Detect C++ digraphs: two consecutive literal tokens that together
+            # form an alternative representation of a single token (ISO C++ §2.6).
+            # We peek at the next token; if the pair is a known digraph we merge
+            # them into the canonical single-character token *before* appending
+            # to the buffer, so the rest of the parser never sees the raw digraph.
+            if tok.type in ("<", "%", ":"):
+                tok2 = get_token()
+                if tok2 is not None:
+                    replacement = digraph_map.get((tok.type, tok2.value))
+                    if replacement is not None:
+                        # Reuse tok, replace its type/value with the canonical token.
+                        tok.type = replacement
+                        tok.value = replacement
+                        tokbuf.append(tok)
+                        tok = get_token()
+                        if tok is None:
+                            break
+                        continue
+                    else:
+                        # Not a digraph — process tok normally and re-queue tok2.
+                        tokbuf.append(tok)
+                        tok = tok2
+                        continue
+                # tok2 is None (EOF): fall through to append tok and return.
+
             tokbuf.append(tok)
 
             if tok.type == "NEWLINE":
@@ -867,4 +919,4 @@ def get_doxygen_after(self) -> typing.Optional[str]:
     try:
         lex.runmain(lexer=PlyLexer(None))
     except EOFError:
-        pass
+        pass
\ No newline at end of file
diff --git a/tests/test_digraphs.py b/tests/test_digraphs.py
new file mode 100644
index 0000000..a6373d4
--- /dev/null
+++ b/tests/test_digraphs.py
@@ -0,0 +1,213 @@
+from cxxheaderparser.simple import parse_string, ParsedData
+from cxxheaderparser.types import (
+    FundamentalSpecifier,
+    NameSpecifier,
+    PQName,
+    Type,
+    Variable,
+    Function,
+    FunctionType,
+    Parameter,
+    Array,
+)
+
+
+def test_digraph_brace_open_close_function():
+    """<% %> should work as { } in a function body context (body is skipped)."""
+    # The parser skips function bodies but must recognise the braces.
+    content = """\
+#include <iostream>
+int main()
+<%
+    std::cout << "Hello, World!" << std::endl;
+    return 0;
+%>
+"""
+    result = parse_string(content)
+    assert isinstance(result, ParsedData)
+    assert [f.name.segments[-1].name for f in result.namespace.functions] == ["main"]
+
+
+def test_digraph_struct_body():
+    """<% %> should work as { } around a struct body."""
+    content = """\
+struct Point
+<%
+    int x;
+    int y;
+%>;
+"""
+    result = parse_string(content)
+    assert len(result.namespace.classes) == 1
+    cls = result.namespace.classes[0]
+    assert cls.class_decl.typename.segments[-1].name == "Point"
+    field_names = [f.name for f in cls.fields]
+    assert field_names == ["x", "y"]
+
+
+def test_digraph_namespace_body():
+    """<% %> should work as { } around a namespace body."""
+    content = """\
+namespace myns
+<%
+    int value;
+%>
+"""
+    result = parse_string(content)
+    assert "myns" in result.namespace.namespaces
+    ns = result.namespace.namespaces["myns"]
+    assert len(ns.variables) == 1
+    assert ns.variables[0].name.segments[-1].name == "value"
+
+
+def test_digraph_nested_braces():
+    """Nested digraph brace pairs should work correctly."""
+    content = """\
+namespace outer
+<%
+    struct Inner
+    <%
+        int val;
+    %>;
+%>
+"""
+    result = parse_string(content)
+    assert "outer" in result.namespace.namespaces
+    ns = result.namespace.namespaces["outer"]
+    assert len(ns.classes) == 1
+    inner = ns.classes[0]
+    assert inner.class_decl.typename.segments[-1].name == "Inner"
+    assert inner.fields[0].name == "val"
+
+
+def test_digraph_mixed_braces():
+    """Digraph and canonical braces can be mixed freely."""
+    content = """\
+namespace ns
+<%
+    struct Foo {
+        int a;
+    };
+%>
+"""
+    result = parse_string(content)
+    assert "ns" in result.namespace.namespaces
+    ns = result.namespace.namespaces["ns"]
+    assert len(ns.classes) == 1
+
+
+def test_digraph_array_subscript():
+    """<: :> should work as [ ] in an array declaration."""
+    content = """\
+int arr<:10:>;
+"""
+    result = parse_string(content)
+    assert len(result.namespace.variables) == 1
+    var = result.namespace.variables[0]
+    assert var.name.segments[-1].name == "arr"
+    # Only the Array wrapper is checked; element type and size are not asserted.
+    assert isinstance(var.type, Array)
+
+
+def test_digraph_array_and_brace():
+    """Both digraph pairs used together."""
+    content = """\
+struct Grid
+<%
+    float data<:4:>;
+%>;
+"""
+    result = parse_string(content)
+    cls = result.namespace.classes[0]
+    assert cls.class_decl.typename.segments[-1].name == "Grid"
+    field = cls.fields[0]
+    assert field.name == "data"
+    assert isinstance(field.type, Array)
+
+
+def test_canonical_tokens_unaffected():
+    """Normal { } [ ] tokens must continue to work after digraph support."""
+    content = """\
+namespace ns {
+    struct Foo {
+        int arr[5];
+    };
+}
+"""
+    result = parse_string(content)
+    assert "ns" in result.namespace.namespaces
+    ns = result.namespace.namespaces["ns"]
+    cls = ns.classes[0]
+    assert cls.fields[0].name == "arr"
+
+
+def test_template_angle_brackets_unaffected():
+    """< > used as template angle brackets must NOT be treated as digraphs."""
+    content = """\
+template <typename T, int N>
+struct Container
+{
+    T data[N];
+};
+"""
+    result = parse_string(content)
+    cls = result.namespace.classes[0]
+    assert cls.class_decl.typename.segments[-1].name == "Container"
+    assert len(cls.class_decl.template.params) == 2
+
+
+def test_shift_left_unaffected():
+    """A << in a default argument (SHIFT_LEFT token) must survive digraph detection."""
+    content = """\
+template <typename T>
+void fn(T x = 1 << 4);
+"""
+    result = parse_string(content)
+    assert len(result.namespace.functions) == 1
+    assert result.namespace.functions[0].name.segments[-1].name == "fn"
+
+
+def test_percent_operator_unaffected():
+    """A bare % in an expression context must not be altered."""
+    # The parser skips default parameter expressions, so we embed % there.
+    content = """\
+void fn(int x = 10 % 3);
+"""
+    result = parse_string(content)
+    assert result.namespace.functions[0].name.segments[-1].name == "fn"
+
+
+def test_digraph_enum():
+    """<% %> should work as { } in an enum definition."""
+    content = """\
+enum Color
+<%
+    Red,
+    Green,
+    Blue
+%>;
+"""
+    result = parse_string(content)
+    assert len(result.namespace.enums) == 1
+    en = result.namespace.enums[0]
+    values = [v.name for v in en.values]
+    assert values == ["Red", "Green", "Blue"]
+
+
+def test_digraph_class_with_methods():
+    """<% %> braces work for a class with member function declarations."""
+    content = """\
+class MyClass
+<%
+public:
+    MyClass();
+    ~MyClass();
+    int getValue() const;
+%>;
+"""
+    result = parse_string(content)
+    cls = result.namespace.classes[0]
+    assert cls.class_decl.typename.segments[-1].name == "MyClass"
+    method_names = [m.name.segments[-1].name for m in cls.methods]
+    assert "MyClass" in method_names
+    assert "getValue" in method_names
\ No newline at end of file

From fe99770edd1773a2dc0761ce315bf15cb6c67264 Mon Sep 17 00:00:00 2001
From: Hayden Michel <haydenmichel8@gmail.com>
Date: Wed, 24 Jun 2026 15:49:24 -0400
Subject: [PATCH 2/5] Add newline at end of file in lexer.py

---
 cxxheaderparser/lexer.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cxxheaderparser/lexer.py b/cxxheaderparser/lexer.py
index 05209ef..863a5bd 100644
--- a/cxxheaderparser/lexer.py
+++ b/cxxheaderparser/lexer.py
@@ -919,4 +919,5 @@ def get_doxygen_after(self) -> typing.Optional[str]:
     try:
         lex.runmain(lexer=PlyLexer(None))
     except EOFError:
-        pass
\ No newline at end of file
+        pass
+        

From 638b5ecf37e604f8d5653585a0c621a75cca5aca Mon Sep 17 00:00:00 2001
From: Hayden Michel <haydenmichel8@gmail.com>
Date: Wed, 24 Jun 2026 15:49:59 -0400
Subject: [PATCH 3/5] Add new line at the end of test_digraphs.py

---
 tests/test_digraphs.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_digraphs.py b/tests/test_digraphs.py
index a6373d4..cfd7890 100644
--- a/tests/test_digraphs.py
+++ b/tests/test_digraphs.py
@@ -210,4 +210,5 @@ class MyClass
     assert cls.class_decl.typename.segments[-1].name == "MyClass"
     method_names = [m.name.segments[-1].name for m in cls.methods]
     assert "MyClass" in method_names
-    assert "getValue" in method_names
\ No newline at end of file
+    assert "getValue" in method_names
+    

From 09b3cf927caea860084b9c7b1239c43b9ef29797 Mon Sep 17 00:00:00 2001
From: Hayden Michel <haydenmichel8@gmail.com>
Date: Wed, 24 Jun 2026 15:58:14 -0400
Subject: [PATCH 4/5] Remove unnecessary blank line in lexer.py

---
 cxxheaderparser/lexer.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cxxheaderparser/lexer.py b/cxxheaderparser/lexer.py
index 863a5bd..8c69eee 100644
--- a/cxxheaderparser/lexer.py
+++ b/cxxheaderparser/lexer.py
@@ -920,4 +920,3 @@ def get_doxygen_after(self) -> typing.Optional[str]:
         lex.runmain(lexer=PlyLexer(None))
     except EOFError:
         pass
-        

From 490ba835968723cd1538a95bed62fcb7e8ff052e Mon Sep 17 00:00:00 2001
From: Hayden Michel <haydenmichel8@gmail.com>
Date: Wed, 24 Jun 2026 15:58:44 -0400
Subject: [PATCH 5/5] Remove trailing whitespace in test_digraphs.py

---
 tests/test_digraphs.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/test_digraphs.py b/tests/test_digraphs.py
index cfd7890..489f32e 100644
--- a/tests/test_digraphs.py
+++ b/tests/test_digraphs.py
@@ -211,4 +211,3 @@ class MyClass
     method_names = [m.name.segments[-1].name for m in cls.methods]
     assert "MyClass" in method_names
     assert "getValue" in method_names
-