From d9888b3859bf2df4c71d008e2f93135c3dafeff1 Mon Sep 17 00:00:00 2001 From: ByteWise <156506452+2830500285@users.noreply.github.com> Date: Fri, 12 Jun 2026 19:40:51 -0700 Subject: [PATCH 1/2] fix: respect token limit when merging text chunks --- dotnet/src/SemanticKernel.Core/Text/TextChunker.cs | 10 +++------- .../SemanticKernel.UnitTests/Text/TextChunkerTests.cs | 11 +++++++++++ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/dotnet/src/SemanticKernel.Core/Text/TextChunker.cs b/dotnet/src/SemanticKernel.Core/Text/TextChunker.cs index d8f4a32b4e3c..c51873290f70 100644 --- a/dotnet/src/SemanticKernel.Core/Text/TextChunker.cs +++ b/dotnet/src/SemanticKernel.Core/Text/TextChunker.cs @@ -195,15 +195,11 @@ private static List<string> ProcessParagraphs(List<string> paragraphs, int adjus var lastParagraphTokens = lastParagraph.Split(s_spaceChar, StringSplitOptions.RemoveEmptyEntries); var secondLastParagraphTokens = secondLastParagraph.Split(s_spaceChar, StringSplitOptions.RemoveEmptyEntries); - var lastParagraphTokensCount = lastParagraphTokens.Length; - var secondLastParagraphTokensCount = secondLastParagraphTokens.Length; + var mergedParagraph = $"{string.Join(" ", secondLastParagraphTokens)} {string.Join(" ", lastParagraphTokens)}"; - if (lastParagraphTokensCount + secondLastParagraphTokensCount <= adjustedMaxTokensPerParagraph) + if (GetTokenCount(mergedParagraph, tokenCounter) <= adjustedMaxTokensPerParagraph) { - var newSecondLastParagraph = string.Join(" ", secondLastParagraphTokens); - var newLastParagraph = string.Join(" ", lastParagraphTokens); - - paragraphs[paragraphs.Count - 2] = $"{newSecondLastParagraph} {newLastParagraph}"; + paragraphs[paragraphs.Count - 2] = mergedParagraph; paragraphs.RemoveAt(paragraphs.Count - 1); } } diff --git a/dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs b/dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs index a31f077eef66..7094443944a3 100644 --- 
a/dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs +++ b/dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs @@ -558,6 +558,17 @@ public void CanSplitTextParagraphsWithCustomTokenCounter() Assert.Equal(expected, result); } + [Fact] + public void SplitTextParagraphsDoesNotMergeShortLastParagraphPastTokenLimit() + { + var input = new[] { "123456789", "x" }; + + var result = TextChunker.SplitPlainTextParagraphs(input, 10, tokenCounter: input => input.Length); + + Assert.Equal(["123456789", "x"], result); + Assert.All(result, paragraph => Assert.True(paragraph.Length <= 10, $"Paragraph exceeded token limit: {paragraph}")); + } + [Fact] public void CanSplitTextParagraphsWithOverlapAndCustomTokenCounter() { From b42ee5492ea0915ef551d1757ddbd7784bd43910 Mon Sep 17 00:00:00 2001 From: ByteWise <156506452+2830500285@users.noreply.github.com> Date: Fri, 12 Jun 2026 19:44:38 -0700 Subject: [PATCH 2/2] test: clarify text chunker token counter --- dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs b/dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs index 7094443944a3..fd53dedf1504 100644 --- a/dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs +++ b/dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs @@ -563,7 +563,7 @@ public void SplitTextParagraphsDoesNotMergeShortLastParagraphPastTokenLimit() { var input = new[] { "123456789", "x" }; - var result = TextChunker.SplitPlainTextParagraphs(input, 10, tokenCounter: input => input.Length); + var result = TextChunker.SplitPlainTextParagraphs(input, 10, tokenCounter: text => text.Length); Assert.Equal(["123456789", "x"], result); Assert.All(result, paragraph => Assert.True(paragraph.Length <= 10, $"Paragraph exceeded token limit: {paragraph}"));