From ffa06dbcab0a1cc3c58e311e2d7d80b4ce3f213a Mon Sep 17 00:00:00 2001
From: Gerard Gunnewijk
Date: Thu, 23 Apr 2026 22:41:32 +0200
Subject: [PATCH] Fixed #87. CID text bytes could contain 0x0D, which was
 written unescaped in a string literal, so a parser would read it back as
 0x0A; in the reported font that is the glyph for capital G. When that glyph
 was unused it had been removed by subsetting, resulting in a .notdef glyph
 being shown. Fixed by both escaping more characters in string literals and
 using hex strings for CID text in content streams.

---
 .../Content/ContentStream.cs                  |   6 +-
 .../Generation/PdfStream.cs                   |  82 +++++++---
 .../Content/ContentStreamTests.cs             | 119 ++++++++++++++
 .../Generation/PdfStreamTests.cs              | 145 ++++++++++++++++++
 4 files changed, 331 insertions(+), 21 deletions(-)
 create mode 100644 tests/Synercoding.FileFormats.Pdf.Tests/Content/ContentStreamTests.cs

diff --git a/src/Synercoding.FileFormats.Pdf/Content/ContentStream.cs b/src/Synercoding.FileFormats.Pdf/Content/ContentStream.cs
index eab9d1b..2c67cc4 100644
--- a/src/Synercoding.FileFormats.Pdf/Content/ContentStream.cs
+++ b/src/Synercoding.FileFormats.Pdf/Content/ContentStream.cs
@@ -234,7 +234,7 @@ public ContentStream EndText()
     public ContentStream ShowTextTj(byte[] line)
     {
         InnerStream
-            .WriteStringLiteral(line)
+            .WriteStringHex(line)
             .Space()
             .Write("Tj")
             .NewLine();
@@ -250,7 +250,7 @@ public ContentStream ShowTextTj(byte[] line)
     public ContentStream MoveNextLineShowText(byte[] line)
     {
         InnerStream
-            .WriteStringLiteral(line)
+            .WriteStringHex(line)
             .Space()
             .Write("'")
             .NewLine();
@@ -285,7 +285,7 @@ public ContentStream MoveNextLineShowText(byte[] line, double wordSpacing, doubl
             .Space()
             .Write(characterSpacing)
             .Space()
-            .WriteStringLiteral(line)
+            .WriteStringHex(line)
             .Space()
             .Write("\"")
             .NewLine();
diff --git a/src/Synercoding.FileFormats.Pdf/Generation/PdfStream.cs b/src/Synercoding.FileFormats.Pdf/Generation/PdfStream.cs
index e6cbd12..378ffb6 100644
--- 
a/src/Synercoding.FileFormats.Pdf/Generation/PdfStream.cs
+++ b/src/Synercoding.FileFormats.Pdf/Generation/PdfStream.cs
@@ -259,16 +259,7 @@ internal PdfStream WriteStringLiteral(string value)
             : [.. Encoding.UTF8.Preamble, .. Encoding.UTF8.GetBytes(value)];
 
         foreach (var b in bytes)
-        {
-            if (b == '(')
-                Write('\\').Write('(');
-            else if (b == ')')
-                Write('\\').Write(')');
-            else if (b == '\\')
-                Write('\\').Write('\\');
-            else
-                Write(b);
-        }
+            _writeLiteralByte(b);
 
         WriteByte(0x29); // )
 
@@ -285,22 +276,77 @@ internal PdfStream WriteStringLiteral(byte[] encodedString)
         WriteByte(0x28); // (
 
         foreach (var b in encodedString)
+            _writeLiteralByte(b);
+
+        WriteByte(0x29); // )
+
+        return this;
+    }
+
+    /// <summary>
+    /// Write an encoded byte sequence to the stream as a PDF hexadecimal string.
+    /// </summary>
+    /// <remarks>
+    /// Hexadecimal strings (ISO 32000-1 §7.3.4.3) are the correct container for arbitrary
+    /// binary data such as CID-encoded show-text operands: they have no escape rules and
+    /// no end-of-line normalisation, so every byte round-trips exactly.
+    /// </remarks>
+    /// <param name="encodedString">The bytes to write.</param>
+    /// <returns>The <see cref="PdfStream"/> to support chaining operations.</returns>
+    internal PdfStream WriteStringHex(byte[] encodedString)
+    {
+        WriteByte(0x3C); // <
+
+        Span<byte> pair = stackalloc byte[2];
+        foreach (var b in encodedString)
+        {
+            pair[0] = _hexNibble(b >> 4);
+            pair[1] = _hexNibble(b & 0x0F);
+            Write(pair);
+        }
+
+        WriteByte(0x3E); // >
+
+        return this;
+    }
+
+    private void _writeLiteralByte(byte b)
+    {
+        switch (b)
         {
-            if (b == '(')
+            case (byte)'(':
                 Write('\\').Write('(');
-            else if (b == ')')
+                break;
+            case (byte)')':
                 Write('\\').Write(')');
-            else if (b == '\\')
+                break;
+            case (byte)'\\':
                 Write('\\').Write('\\');
-            else
+                break;
+            case 0x0A:
+                Write('\\').Write('n');
+                break;
+            case 0x0D:
+                Write('\\').Write('r');
+                break;
+            case 0x09:
+                Write('\\').Write('t');
+                break;
+            case 0x08:
+                Write('\\').Write('b');
+                break;
+            case 0x0C:
+                Write('\\').Write('f');
+                break;
+            default:
                 WriteByte(b);
+                break;
         }
-
-        WriteByte(0x29); // )
-
-        return this;
     }
 
+    private static byte _hexNibble(int n)
+        => (byte)( n < 10 ? ( '0' + n ) : ( 'A' + n - 10 ) );
+
     /// <summary>
     /// Write an array of numbers to the pdf stream
     /// </summary>
diff --git a/tests/Synercoding.FileFormats.Pdf.Tests/Content/ContentStreamTests.cs b/tests/Synercoding.FileFormats.Pdf.Tests/Content/ContentStreamTests.cs
new file mode 100644
index 0000000..d2f44db
--- /dev/null
+++ b/tests/Synercoding.FileFormats.Pdf.Tests/Content/ContentStreamTests.cs
@@ -0,0 +1,119 @@
+using Synercoding.FileFormats.Pdf.Content;
+using Synercoding.FileFormats.Pdf.Generation;
+using Synercoding.FileFormats.Pdf.Generation.Internal;
+using Synercoding.FileFormats.Pdf.Primitives;
+using System.Text;
+
+namespace Synercoding.FileFormats.Pdf.Tests.Content;
+
+/// <summary>
+/// Regression tests for issue #87 — CID-encoded show-text operands must survive the
+/// serialiser unchanged. When literal strings were used, bytes containing 0x0D were
+/// normalised to 0x0A by the PDF parser (ISO 32000-1 §7.3.4.2), which silently
+/// shifted the CID lookup and produced wrong or missing glyphs.
+/// </summary>
+public class ContentStreamTests : IDisposable
+{
+    private readonly TableBuilder _tableBuilder;
+    private readonly CachedResources _cachedResources;
+    private readonly PageResources _pageResources;
+    private readonly ContentStream _contentStream;
+
+    public ContentStreamTests()
+    {
+        _tableBuilder = new TableBuilder();
+        _cachedResources = new CachedResources(_tableBuilder);
+        _pageResources = new PageResources(_tableBuilder, _cachedResources);
+        _contentStream = new ContentStream(_tableBuilder.ReserveId(), _pageResources);
+    }
+
+    public void Dispose()
+    {
+        _contentStream.Dispose();
+        _pageResources.Dispose();
+    }
+
+    [Fact]
+    public void ShowTextTj_GlyphId0x000D_WritesHexString()
+    {
+        // Reproduces the original issue #87 case: Source Sans Pro capital 'J'
+        // maps to glyph id 13 (0x000D). The bytes must round-trip verbatim.
+        _contentStream.ShowTextTj(new byte[] { 0x00, 0x0D });
+
+        var written = Encoding.ASCII.GetString(_contentStream.InnerStream.ToStreamObject().RawData);
+
+        Assert.Contains("<000D>", written);
+        Assert.Contains("Tj", written);
+    }
+
+    [Fact]
+    public void ShowTextTj_GlyphIdWithCarriageReturnInHighByte_WritesHexString()
+    {
+        _contentStream.ShowTextTj(new byte[] { 0x0D, 0x42 });
+
+        var written = Encoding.ASCII.GetString(_contentStream.InnerStream.ToStreamObject().RawData);
+
+        Assert.Contains("<0D42>", written);
+    }
+
+    [Fact]
+    public void ShowTextTj_ConsecutiveCidsForming0D0A_PreservesAlignment()
+    {
+        // A literal string would collapse 0D 0A to a single 0x0A, shifting
+        // alignment for every subsequent 2-byte CID from that point on.
+        _contentStream.ShowTextTj(new byte[] { 0x01, 0x0D, 0x0A, 0x02 });
+
+        var written = Encoding.ASCII.GetString(_contentStream.InnerStream.ToStreamObject().RawData);
+
+        Assert.Contains("<010D0A02>", written);
+    }
+
+    [Fact]
+    public void ShowTextTj_BytesMatchingLiteralDelimiters_AreEmittedAsHex()
+    {
+        // A glyph id whose byte encoding contains '(' / ')' / '\' was previously
+        // escaped for a literal string; hex strings write them verbatim.
+        _contentStream.ShowTextTj(new byte[] { 0x28, 0x29, 0x5C });
+
+        var written = Encoding.ASCII.GetString(_contentStream.InnerStream.ToStreamObject().RawData);
+
+        Assert.Contains("<28295C>", written);
+        Assert.DoesNotContain("\\(", written);
+        Assert.DoesNotContain("\\)", written);
+        Assert.DoesNotContain("\\\\", written);
+    }
+
+    [Fact]
+    public void ShowTextTj_DoesNotEmitLiteralStringDelimiters()
+    {
+        // Regression guard: the operand must no longer be wrapped in ( … ).
+        _contentStream.ShowTextTj(new byte[] { 0x00, 0x0D });
+
+        var rawData = _contentStream.InnerStream.ToStreamObject().RawData;
+
+        Assert.DoesNotContain((byte)'(', rawData);
+        Assert.DoesNotContain((byte)')', rawData);
+    }
+
+    [Fact]
+    public void MoveNextLineShowText_GlyphId0x000D_WritesHexString()
+    {
+        _contentStream.MoveNextLineShowText(new byte[] { 0x00, 0x0D });
+
+        var written = Encoding.ASCII.GetString(_contentStream.InnerStream.ToStreamObject().RawData);
+
+        Assert.Contains("<000D>", written);
+        Assert.Contains("'", written);
+    }
+
+    [Fact]
+    public void MoveNextLineShowText_WithSpacing_WritesHexString()
+    {
+        _contentStream.MoveNextLineShowText(new byte[] { 0x00, 0x0D }, wordSpacing: 1.0, characterSpacing: 2.0);
+
+        var written = Encoding.ASCII.GetString(_contentStream.InnerStream.ToStreamObject().RawData);
+
+        Assert.Contains("<000D>", written);
+        Assert.Contains("\"", written);
+    }
+}
diff --git a/tests/Synercoding.FileFormats.Pdf.Tests/Generation/PdfStreamTests.cs b/tests/Synercoding.FileFormats.Pdf.Tests/Generation/PdfStreamTests.cs
index 
533b840..1aa256a 100644
--- a/tests/Synercoding.FileFormats.Pdf.Tests/Generation/PdfStreamTests.cs
+++ b/tests/Synercoding.FileFormats.Pdf.Tests/Generation/PdfStreamTests.cs
@@ -266,6 +266,151 @@ public void Test_ToStreamObject_WithThreeFilters_ReturnsStreamObjectWithFilterAr
         Assert.Equal(thirdEncoded, streamObject.RawData);
     }
 
+    // Regression tests for issue #87 — the literal-string escape table and the
+    // hex-string fallback used for CID-encoded show-text operands.
+
+    [Fact]
+    public void WriteStringHex_EmptyArray_WritesEmptyAngleBrackets()
+    {
+        using var memoryStream = new MemoryStream();
+        var pdfStream = new PdfStream(memoryStream);
+
+        pdfStream.WriteStringHex(Array.Empty<byte>());
+
+        Assert.Equal(new byte[] { 0x3C, 0x3E }, memoryStream.ToArray());
+    }
+
+    [Fact]
+    public void WriteStringHex_GlyphId0x000D_PreservesCarriageReturn()
+    {
+        // Issue #87: Source Sans Pro capital 'J' has glyph id 13 (0x000D).
+        // Previously this was written as a literal string, where the parser
+        // would silently normalise 0x0D to 0x0A and the consumer would look
+        // up CID 10 instead of 13.
+        using var memoryStream = new MemoryStream();
+        var pdfStream = new PdfStream(memoryStream);
+
+        pdfStream.WriteStringHex(new byte[] { 0x00, 0x0D });
+
+        Assert.Equal("<000D>", Encoding.ASCII.GetString(memoryStream.ToArray()));
+    }
+
+    [Fact]
+    public void WriteStringHex_GlyphIdWithCarriageReturnInHighByte_IsPreserved()
+    {
+        using var memoryStream = new MemoryStream();
+        var pdfStream = new PdfStream(memoryStream);
+
+        pdfStream.WriteStringHex(new byte[] { 0x0D, 0x42 });
+
+        Assert.Equal("<0D42>", Encoding.ASCII.GetString(memoryStream.ToArray()));
+    }
+
+    [Fact]
+    public void WriteStringHex_ConsecutiveCidsForming0D0A_PreservesAlignment()
+    {
+        // A literal string would collapse the 0D 0A pair into a single 0x0A,
+        // shifting alignment for every subsequent 2-byte CID. A hex string
+        // round-trips every byte.
+        using var memoryStream = new MemoryStream();
+        var pdfStream = new PdfStream(memoryStream);
+
+        pdfStream.WriteStringHex(new byte[] { 0x01, 0x0D, 0x0A, 0x02 });
+
+        Assert.Equal("<010D0A02>", Encoding.ASCII.GetString(memoryStream.ToArray()));
+    }
+
+    [Fact]
+    public void WriteStringHex_BytesThatWouldBeEscapedInLiteral_AreWrittenRaw()
+    {
+        using var memoryStream = new MemoryStream();
+        var pdfStream = new PdfStream(memoryStream);
+
+        pdfStream.WriteStringHex(new byte[] { 0x28, 0x29, 0x5C });
+
+        Assert.Equal("<28295C>", Encoding.ASCII.GetString(memoryStream.ToArray()));
+    }
+
+    [Fact]
+    public void WriteStringHex_AllBytes_RoundTripExactly()
+    {
+        var input = new byte[256];
+        for (int i = 0; i < 256; i++)
+            input[i] = (byte)i;
+
+        using var memoryStream = new MemoryStream();
+        var pdfStream = new PdfStream(memoryStream);
+
+        pdfStream.WriteStringHex(input);
+
+        var output = Encoding.ASCII.GetString(memoryStream.ToArray());
+        Assert.StartsWith("<", output);
+        Assert.EndsWith(">", output);
+        Assert.Equal(( input.Length * 2 ) + 2, output.Length);
+
+        // Parse the hex back and verify every byte round-trips.
+        var hex = output[1..^1];
+        var roundTripped = new byte[input.Length];
+        for (int i = 0; i < input.Length; i++)
+            roundTripped[i] = Convert.ToByte(hex.Substring(i * 2, 2), 16);
+        Assert.Equal(input, roundTripped);
+    }
+
+    [Fact]
+    public void WriteStringLiteral_String_EscapesCarriageReturn()
+    {
+        using var memoryStream = new MemoryStream();
+        var pdfStream = new PdfStream(memoryStream);
+
+        pdfStream.WriteStringLiteral("a\rb");
+
+        Assert.Equal("(a\\rb)", Encoding.ASCII.GetString(memoryStream.ToArray()));
+    }
+
+    [Fact]
+    public void WriteStringLiteral_String_EscapesLineFeed()
+    {
+        using var memoryStream = new MemoryStream();
+        var pdfStream = new PdfStream(memoryStream);
+
+        pdfStream.WriteStringLiteral("a\nb");
+
+        Assert.Equal("(a\\nb)", Encoding.ASCII.GetString(memoryStream.ToArray()));
+    }
+
+    [Fact]
+    public void WriteStringLiteral_String_EscapesTab()
+    {
+        using var memoryStream = new MemoryStream();
+        var pdfStream = new PdfStream(memoryStream);
+
+        pdfStream.WriteStringLiteral("a\tb");
+
+        Assert.Equal("(a\\tb)", Encoding.ASCII.GetString(memoryStream.ToArray()));
+    }
+
+    [Fact]
+    public void WriteStringLiteral_String_StillEscapesParenthesesAndBackslash()
+    {
+        using var memoryStream = new MemoryStream();
+        var pdfStream = new PdfStream(memoryStream);
+
+        pdfStream.WriteStringLiteral("(a)\\b");
+
+        Assert.Equal("(\\(a\\)\\\\b)", Encoding.ASCII.GetString(memoryStream.ToArray()));
+    }
+
+    [Fact]
+    public void WriteStringLiteral_Bytes_EscapesControlCharacters()
+    {
+        using var memoryStream = new MemoryStream();
+        var pdfStream = new PdfStream(memoryStream);
+
+        pdfStream.WriteStringLiteral(new byte[] { 0x0D, 0x0A, 0x09, 0x08, 0x0C });
+
+        Assert.Equal("(\\r\\n\\t\\b\\f)", Encoding.ASCII.GetString(memoryStream.ToArray()));
+    }
+
     private class PassThroughFilterStub : IStreamFilter
     {
         public PdfName Name => PdfName.Get("PassThrough");