Skip to content

Commit 43bdcd6

Browse files
JoshRosen and claude authored
fix: handle non-BMP Unicode codepoints in foldl, foldr, and %c format (#606)
This PR fixes two more non-BMP Unicode bugs:

- foldl/foldr iterated strings by UTF-16 code unit (`for (char <- s.value)`), splitting non-BMP characters like emoji into surrogate pair halves. Use `codePointAt`/`codePointBefore` with `Character.charCount` for correct codepoint iteration.
- The `%c` format conversion used `s.toChar.toString`, which truncates codepoints above U+FFFF to 16 bits. Use `Character.toString(s.toInt)` instead.

---

All code written by Claude Opus 4.6.

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 8430930 commit 43bdcd6

3 files changed

Lines changed: 42 additions & 5 deletions

File tree

sjsonnet/src/sjsonnet/Format.scala

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -196,7 +196,7 @@ object Format {
196196
case 'f' | 'F' => formatFloat(formatted, s)
197197
case 'g' => formatGeneric(formatted, s).toLowerCase
198198
case 'G' => formatGeneric(formatted, s)
199-
case 'c' => widenRaw(formatted, s.toChar.toString)
199+
case 'c' => widenRaw(formatted, Character.toString(s.toInt))
200200
case 's' =>
201201
if (s.toLong == s) widenRaw(formatted, s.toLong.toString)
202202
else widenRaw(formatted, s.toString)

sjsonnet/src/sjsonnet/stdlib/ArrayModule.scala

Lines changed: 12 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -296,12 +296,16 @@ object ArrayModule extends AbstractFunctionModule {
296296

297297
case s: Val.Str =>
298298
var current = init.force
299-
for (char <- s.value) {
299+
val str = s.value
300+
var i = 0
301+
while (i < str.length) {
300302
val c = current
301-
current = func.apply2(c, Val.Str(pos, new String(Array(char))), pos.noOffset)(
303+
val codePoint = str.codePointAt(i)
304+
current = func.apply2(c, Val.Str(pos, Character.toString(codePoint)), pos.noOffset)(
302305
ev,
303306
TailstrictModeDisabled
304307
)
308+
i += Character.charCount(codePoint)
305309
}
306310
current
307311

@@ -324,9 +328,13 @@ object ArrayModule extends AbstractFunctionModule {
324328
current
325329
case s: Val.Str =>
326330
var current = init.force
327-
for (char <- s.value.reverse) {
331+
val str = s.value
332+
var i = str.length
333+
while (i > 0) {
334+
val codePoint = str.codePointBefore(i)
335+
i -= Character.charCount(codePoint)
328336
val c = current
329-
current = func.apply2(Val.Str(pos, new String(Array(char))), c, pos.noOffset)(
337+
current = func.apply2(Val.Str(pos, Character.toString(codePoint)), c, pos.noOffset)(
330338
ev,
331339
TailstrictModeDisabled
332340
)

sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala

Lines changed: 29 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -256,5 +256,34 @@ object UnicodeHandlingTests extends TestSuite {
256256
eval("""std.trim("🌍 ")""") ==> ujson.Str("🌍")
257257
eval("""std.trim(" 🌍 ")""") ==> ujson.Str("🌍")
258258
}
259+
260+
test("foldl") {
261+
// foldl must iterate by codepoint, not UTF-16 code unit
262+
eval("""std.foldl(function(acc, c) acc + [c], "a😀b", [])""") ==>
263+
ujson.Arr("a", "😀", "b")
264+
eval("""std.foldl(function(acc, c) acc + 1, "a😀b", 0)""") ==> ujson.Num(3)
265+
eval("""std.foldl(function(acc, c) acc + [c], "🎉🔥", [])""") ==>
266+
ujson.Arr("🎉", "🔥")
267+
// Round-trip concatenation
268+
eval("""std.foldl(function(acc, c) acc + c, "a😀b", "")""") ==> ujson.Str("a😀b")
269+
}
270+
271+
test("foldr") {
272+
// foldr must iterate by codepoint, not UTF-16 code unit
273+
eval("""std.foldr(function(c, acc) acc + [c], "a😀b", [])""") ==>
274+
ujson.Arr("b", "😀", "a")
275+
eval("""std.foldr(function(c, acc) acc + [c], "🎉🔥", [])""") ==>
276+
ujson.Arr("🔥", "🎉")
277+
// Round-trip concatenation (right-to-left: 'b' then '😀' then 'a')
278+
eval("""std.foldr(function(c, acc) acc + c, "a😀b", "")""") ==> ujson.Str("b😀a")
279+
}
280+
281+
test("formatPercentC") {
282+
// %c must handle non-BMP codepoints
283+
eval("""std.format("%c", [128512])""") ==> ujson.Str("😀") // U+1F600
284+
eval("""std.format("%c", [128293])""") ==> ujson.Str("🔥") // U+1F525
285+
eval("""std.format("%c", [127757])""") ==> ujson.Str("🌍") // U+1F30D
286+
eval("""std.format("%c", [65])""") ==> ujson.Str("A") // BMP char
287+
}
259288
}
260289
}

0 commit comments

Comments
 (0)