From ffa06dbcab0a1cc3c58e311e2d7d80b4ce3f213a Mon Sep 17 00:00:00 2001
From: Gerard Gunnewijk <gerard.gunnewijk@live.nl>
Date: Thu, 23 Apr 2026 22:41:32 +0200
Subject: [PATCH] Fixed #87.

CID-encoded text bytes could contain 0x0D, which was written unescaped into a string literal. A PDF parser normalises that raw end-of-line byte to 0x0A, which in the reported font is the glyph id of capital G. When the font was subsetted and that glyph was not otherwise used, it was not available in the subset, resulting in a .notdef glyph being shown. Fixed by both escaping more characters in string literals and using hex strings for CID text in content streams.
---
 .../Content/ContentStream.cs                  |   6 +-
 .../Generation/PdfStream.cs                   |  82 +++++++---
 .../Content/ContentStreamTests.cs             | 119 ++++++++++++++
 .../Generation/PdfStreamTests.cs              | 145 ++++++++++++++++++
 4 files changed, 331 insertions(+), 21 deletions(-)
 create mode 100644 tests/Synercoding.FileFormats.Pdf.Tests/Content/ContentStreamTests.cs
diff --git a/src/Synercoding.FileFormats.Pdf/Content/ContentStream.cs b/src/Synercoding.FileFormats.Pdf/Content/ContentStream.cs
index eab9d1b..2c67cc4 100644
--- a/src/Synercoding.FileFormats.Pdf/Content/ContentStream.cs
+++ b/src/Synercoding.FileFormats.Pdf/Content/ContentStream.cs
@@ -234,7 +234,7 @@ public ContentStream EndText()
     public ContentStream ShowTextTj(byte[] line)
     {
         InnerStream
-            .WriteStringLiteral(line)
+            .WriteStringHex(line)
             .Space()
             .Write("Tj")
             .NewLine();
@@ -250,7 +250,7 @@ public ContentStream ShowTextTj(byte[] line)
     public ContentStream MoveNextLineShowText(byte[] line)
     {
         InnerStream
-            .WriteStringLiteral(line)
+            .WriteStringHex(line)
             .Space()
             .Write("'")
             .NewLine();
@@ -285,7 +285,7 @@ public ContentStream MoveNextLineShowText(byte[] line, double wordSpacing, doubl
             .Space()
             .Write(characterSpacing)
             .Space()
-            .WriteStringLiteral(line)
+            .WriteStringHex(line)
             .Space()
             .Write("\"")
             .NewLine();
diff --git a/src/Synercoding.FileFormats.Pdf/Generation/PdfStream.cs b/src/Synercoding.FileFormats.Pdf/Generation/PdfStream.cs
index e6cbd12..378ffb6 100644
--- a/src/Synercoding.FileFormats.Pdf/Generation/PdfStream.cs
+++ b/src/Synercoding.FileFormats.Pdf/Generation/PdfStream.cs
@@ -259,16 +259,7 @@ internal PdfStream WriteStringLiteral(string value)
             : [.. Encoding.UTF8.Preamble, .. Encoding.UTF8.GetBytes(value)];
 
         foreach (var b in bytes)
-        {
-            if (b == '(')
-                Write('\\').Write('(');
-            else if (b == ')')
-                Write('\\').Write(')');
-            else if (b == '\\')
-                Write('\\').Write('\\');
-            else
-                Write(b);
-        }
+            _writeLiteralByte(b);
 
         WriteByte(0x29); // )
 
@@ -285,22 +276,77 @@ internal PdfStream WriteStringLiteral(byte[] encodedString)
         WriteByte(0x28); // (
 
         foreach (var b in encodedString)
+            _writeLiteralByte(b);
+
+        WriteByte(0x29); // )
+
+        return this;
+    }
+
+    /// <summary>
+    /// Write an encoded byte sequence to the stream as a PDF hexadecimal string.
+    /// </summary>
+    /// <remarks>
+    /// Hexadecimal strings (ISO 32000-1 §7.3.4.3) are the correct container for arbitrary
+    /// binary data such as CID-encoded show-text operands: they have no escape rules and
+    /// no end-of-line normalisation, so every byte round-trips exactly.
+    /// </remarks>
+    /// <param name="encodedString">The bytes to write.</param>
+    /// <returns>The <see cref="PdfStream"/> to support chaining operations.</returns>
+    internal PdfStream WriteStringHex(byte[] encodedString)
+    {
+        WriteByte(0x3C); // <
+
+        Span<byte> pair = stackalloc byte[2];
+        foreach (var b in encodedString)
+        {
+            pair[0] = _hexNibble(b >> 4);
+            pair[1] = _hexNibble(b & 0x0F);
+            Write(pair);
+        }
+
+        WriteByte(0x3E); // >
+
+        return this;
+    }
+
+    private void _writeLiteralByte(byte b)
+    {
+        switch (b)
         {
-            if (b == '(')
+            case (byte)'(':
                 Write('\\').Write('(');
-            else if (b == ')')
+                break;
+            case (byte)')':
                 Write('\\').Write(')');
-            else if (b == '\\')
+                break;
+            case (byte)'\\':
                 Write('\\').Write('\\');
-            else
+                break;
+            case 0x0A:
+                Write('\\').Write('n');
+                break;
+            case 0x0D:
+                Write('\\').Write('r');
+                break;
+            case 0x09:
+                Write('\\').Write('t');
+                break;
+            case 0x08:
+                Write('\\').Write('b');
+                break;
+            case 0x0C:
+                Write('\\').Write('f');
+                break;
+            default:
                 WriteByte(b);
+                break;
         }
-
-        WriteByte(0x29); // )
-
-        return this;
     }
 
+    private static byte _hexNibble(int n)
+        => (byte)( n < 10 ? ( '0' + n ) : ( 'A' + n - 10 ) );
+
     /// <summary>
     /// Write an array of numbers to the pdf stream
     /// </summary>
diff --git a/tests/Synercoding.FileFormats.Pdf.Tests/Content/ContentStreamTests.cs b/tests/Synercoding.FileFormats.Pdf.Tests/Content/ContentStreamTests.cs
new file mode 100644
index 0000000..d2f44db
--- /dev/null
+++ b/tests/Synercoding.FileFormats.Pdf.Tests/Content/ContentStreamTests.cs
@@ -0,0 +1,119 @@
+using Synercoding.FileFormats.Pdf.Content;
+using Synercoding.FileFormats.Pdf.Generation;
+using Synercoding.FileFormats.Pdf.Generation.Internal;
+using Synercoding.FileFormats.Pdf.Primitives;
+using System.Text;
+
+namespace Synercoding.FileFormats.Pdf.Tests.Content;
+
+/// <summary>
+/// Regression tests for issue #87 — CID-encoded show-text operands must survive the
+/// serialiser unchanged. When literal strings were used, bytes containing 0x0D were
+/// normalised to 0x0A by the PDF parser (ISO 32000-1 §7.3.4.2), which silently
+/// shifted the CID lookup and produced wrong or missing glyphs.
+/// </summary>
+public class ContentStreamTests : IDisposable
+{
+    private readonly TableBuilder _tableBuilder;
+    private readonly CachedResources _cachedResources;
+    private readonly PageResources _pageResources;
+    private readonly ContentStream _contentStream;
+
+    public ContentStreamTests()
+    {
+        _tableBuilder = new TableBuilder();
+        _cachedResources = new CachedResources(_tableBuilder);
+        _pageResources = new PageResources(_tableBuilder, _cachedResources);
+        _contentStream = new ContentStream(_tableBuilder.ReserveId(), _pageResources);
+    }
+
+    public void Dispose()
+    {
+        _contentStream.Dispose();
+        _pageResources.Dispose();
+    }
+
+    [Fact]
+    public void ShowTextTj_GlyphId0x000D_WritesHexString()
+    {
+        // Reproduces the original issue #87 case: Source Sans Pro capital 'J'
+        // maps to glyph id 13 (0x000D). The bytes must round-trip verbatim.
+        _contentStream.ShowTextTj(new byte[] { 0x00, 0x0D });
+
+        var written = Encoding.ASCII.GetString(_contentStream.InnerStream.ToStreamObject().RawData);
+
+        Assert.Contains("<000D>", written);
+        Assert.Contains("Tj", written);
+    }
+
+    [Fact]
+    public void ShowTextTj_GlyphIdWithCarriageReturnInHighByte_WritesHexString()
+    {
+        _contentStream.ShowTextTj(new byte[] { 0x0D, 0x42 });
+
+        var written = Encoding.ASCII.GetString(_contentStream.InnerStream.ToStreamObject().RawData);
+
+        Assert.Contains("<0D42>", written);
+    }
+
+    [Fact]
+    public void ShowTextTj_ConsecutiveCidsForming0D0A_PreservesAlignment()
+    {
+        // A literal string would collapse 0D 0A to a single 0x0A, shifting
+        // alignment for every subsequent 2-byte CID from that point on.
+        _contentStream.ShowTextTj(new byte[] { 0x01, 0x0D, 0x0A, 0x02 });
+
+        var written = Encoding.ASCII.GetString(_contentStream.InnerStream.ToStreamObject().RawData);
+
+        Assert.Contains("<010D0A02>", written);
+    }
+
+    [Fact]
+    public void ShowTextTj_BytesMatchingLiteralDelimiters_AreEmittedAsHex()
+    {
+        // A glyph id whose byte encoding contains '(' / ')' / '\' was previously
+        // backslash-escaped for a literal string; a hex string encodes it as plain hex digits.
+        _contentStream.ShowTextTj(new byte[] { 0x28, 0x29, 0x5C });
+
+        var written = Encoding.ASCII.GetString(_contentStream.InnerStream.ToStreamObject().RawData);
+
+        Assert.Contains("<28295C>", written);
+        Assert.DoesNotContain("\\(", written);
+        Assert.DoesNotContain("\\)", written);
+        Assert.DoesNotContain("\\\\", written);
+    }
+
+    [Fact]
+    public void ShowTextTj_DoesNotEmitLiteralStringDelimiters()
+    {
+        // Regression guard: the operand must no longer be wrapped in ( … ).
+        _contentStream.ShowTextTj(new byte[] { 0x00, 0x0D });
+
+        var rawData = _contentStream.InnerStream.ToStreamObject().RawData;
+
+        Assert.DoesNotContain((byte)'(', rawData);
+        Assert.DoesNotContain((byte)')', rawData);
+    }
+
+    [Fact]
+    public void MoveNextLineShowText_GlyphId0x000D_WritesHexString()
+    {
+        _contentStream.MoveNextLineShowText(new byte[] { 0x00, 0x0D });
+
+        var written = Encoding.ASCII.GetString(_contentStream.InnerStream.ToStreamObject().RawData);
+
+        Assert.Contains("<000D>", written);
+        Assert.Contains("'", written);
+    }
+
+    [Fact]
+    public void MoveNextLineShowText_WithSpacing_WritesHexString()
+    {
+        _contentStream.MoveNextLineShowText(new byte[] { 0x00, 0x0D }, wordSpacing: 1.0, characterSpacing: 2.0);
+
+        var written = Encoding.ASCII.GetString(_contentStream.InnerStream.ToStreamObject().RawData);
+
+        Assert.Contains("<000D>", written);
+        Assert.Contains("\"", written);
+    }
+}
diff --git a/tests/Synercoding.FileFormats.Pdf.Tests/Generation/PdfStreamTests.cs b/tests/Synercoding.FileFormats.Pdf.Tests/Generation/PdfStreamTests.cs
index 533b840..1aa256a 100644
--- a/tests/Synercoding.FileFormats.Pdf.Tests/Generation/PdfStreamTests.cs
+++ b/tests/Synercoding.FileFormats.Pdf.Tests/Generation/PdfStreamTests.cs
@@ -266,6 +266,151 @@ public void Test_ToStreamObject_WithThreeFilters_ReturnsStreamObjectWithFilterAr
         Assert.Equal(thirdEncoded, streamObject.RawData);
     }
 
+    // Regression tests for issue #87 — the literal-string escape table and the
+    // hex-string writer used for CID-encoded show-text operands.
+
+    [Fact]
+    public void WriteStringHex_EmptyArray_WritesEmptyAngleBrackets()
+    {
+        using var memoryStream = new MemoryStream();
+        var pdfStream = new PdfStream(memoryStream);
+
+        pdfStream.WriteStringHex(Array.Empty<byte>());
+
+        Assert.Equal(new byte[] { 0x3C, 0x3E }, memoryStream.ToArray());
+    }
+
+    [Fact]
+    public void WriteStringHex_GlyphId0x000D_PreservesCarriageReturn()
+    {
+        // Issue #87: Source Sans Pro capital 'J' has glyph id 13 (0x000D).
+        // Previously this was written as a literal string, where the parser
+        // would silently normalise 0x0D to 0x0A and the consumer would look
+        // up CID 10 instead of 13.
+        using var memoryStream = new MemoryStream();
+        var pdfStream = new PdfStream(memoryStream);
+
+        pdfStream.WriteStringHex(new byte[] { 0x00, 0x0D });
+
+        Assert.Equal("<000D>", Encoding.ASCII.GetString(memoryStream.ToArray()));
+    }
+
+    [Fact]
+    public void WriteStringHex_GlyphIdWithCarriageReturnInHighByte_IsPreserved()
+    {
+        using var memoryStream = new MemoryStream();
+        var pdfStream = new PdfStream(memoryStream);
+
+        pdfStream.WriteStringHex(new byte[] { 0x0D, 0x42 });
+
+        Assert.Equal("<0D42>", Encoding.ASCII.GetString(memoryStream.ToArray()));
+    }
+
+    [Fact]
+    public void WriteStringHex_ConsecutiveCidsForming0D0A_PreservesAlignment()
+    {
+        // A literal string would collapse the 0D 0A pair into a single 0x0A,
+        // shifting alignment for every subsequent 2-byte CID. A hex string
+        // round-trips every byte.
+        using var memoryStream = new MemoryStream();
+        var pdfStream = new PdfStream(memoryStream);
+
+        pdfStream.WriteStringHex(new byte[] { 0x01, 0x0D, 0x0A, 0x02 });
+
+        Assert.Equal("<010D0A02>", Encoding.ASCII.GetString(memoryStream.ToArray()));
+    }
+
+    [Fact]
+    public void WriteStringHex_BytesThatWouldBeEscapedInLiteral_AreWrittenRaw()
+    {
+        using var memoryStream = new MemoryStream();
+        var pdfStream = new PdfStream(memoryStream);
+
+        pdfStream.WriteStringHex(new byte[] { 0x28, 0x29, 0x5C });
+
+        Assert.Equal("<28295C>", Encoding.ASCII.GetString(memoryStream.ToArray()));
+    }
+
+    [Fact]
+    public void WriteStringHex_AllBytes_RoundTripExactly()
+    {
+        var input = new byte[256];
+        for (int i = 0; i < 256; i++)
+            input[i] = (byte)i;
+
+        using var memoryStream = new MemoryStream();
+        var pdfStream = new PdfStream(memoryStream);
+
+        pdfStream.WriteStringHex(input);
+
+        var output = Encoding.ASCII.GetString(memoryStream.ToArray());
+        Assert.StartsWith("<", output);
+        Assert.EndsWith(">", output);
+        Assert.Equal(( input.Length * 2 ) + 2, output.Length);
+
+        // Parse the hex back and verify every byte round-trips.
+        var hex = output[1..^1];
+        var roundTripped = new byte[input.Length];
+        for (int i = 0; i < input.Length; i++)
+            roundTripped[i] = Convert.ToByte(hex.Substring(i * 2, 2), 16);
+        Assert.Equal(input, roundTripped);
+    }
+
+    [Fact]
+    public void WriteStringLiteral_String_EscapesCarriageReturn()
+    {
+        using var memoryStream = new MemoryStream();
+        var pdfStream = new PdfStream(memoryStream);
+
+        pdfStream.WriteStringLiteral("a\rb");
+
+        Assert.Equal("(a\\rb)", Encoding.ASCII.GetString(memoryStream.ToArray()));
+    }
+
+    [Fact]
+    public void WriteStringLiteral_String_EscapesLineFeed()
+    {
+        using var memoryStream = new MemoryStream();
+        var pdfStream = new PdfStream(memoryStream);
+
+        pdfStream.WriteStringLiteral("a\nb");
+
+        Assert.Equal("(a\\nb)", Encoding.ASCII.GetString(memoryStream.ToArray()));
+    }
+
+    [Fact]
+    public void WriteStringLiteral_String_EscapesTab()
+    {
+        using var memoryStream = new MemoryStream();
+        var pdfStream = new PdfStream(memoryStream);
+
+        pdfStream.WriteStringLiteral("a\tb");
+
+        Assert.Equal("(a\\tb)", Encoding.ASCII.GetString(memoryStream.ToArray()));
+    }
+
+    [Fact]
+    public void WriteStringLiteral_String_StillEscapesParenthesesAndBackslash()
+    {
+        using var memoryStream = new MemoryStream();
+        var pdfStream = new PdfStream(memoryStream);
+
+        pdfStream.WriteStringLiteral("(a)\\b");
+
+        Assert.Equal("(\\(a\\)\\\\b)", Encoding.ASCII.GetString(memoryStream.ToArray()));
+    }
+
+    [Fact]
+    public void WriteStringLiteral_Bytes_EscapesControlCharacters()
+    {
+        using var memoryStream = new MemoryStream();
+        var pdfStream = new PdfStream(memoryStream);
+
+        pdfStream.WriteStringLiteral(new byte[] { 0x0D, 0x0A, 0x09, 0x08, 0x0C });
+
+        Assert.Equal("(\\r\\n\\t\\b\\f)", Encoding.ASCII.GetString(memoryStream.ToArray()));
+    }
+
     private class PassThroughFilterStub : IStreamFilter
     {
         public PdfName Name => PdfName.Get("PassThrough");