From d90931d70a37eb2405f068db726b225e0e725903 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Mej=C3=ADa?= Date: Tue, 21 Apr 2026 16:41:13 +0000 Subject: [PATCH 1/2] GH-3516: Optimize DeltaByteArrayWriter / DeltaLengthByteArrayValuesWriter Two related changes in the DELTA_BYTE_ARRAY write path: 1. DeltaLengthByteArrayValuesWriter: drop the unused LittleEndianDataOutputStream wrapper. Binary.writeTo(arrayOut) works directly with the underlying CapacityByteArrayOutputStream; the LE wrapper added an extra layer of dispatch on every value but never used any LE functionality (writeInt/writeLong/etc.). Add a new writeBytes(byte[], int, int) overload so callers that already have the raw bytes can avoid allocating a Binary wrapper. 2. DeltaByteArrayWriter: tighten suffixWriter field type to DeltaLengthByteArrayValuesWriter (it's always constructed as one) so the new writeBytes(byte[], int, int) overload is callable. Replace the suffix call with the raw-bytes overload, eliminating the per-value Binary.slice() allocation. Benchmark (BinaryEncodingBenchmark, 100k BINARY values per invocation, JMH -wi 3 -i 5 -f 1): Benchmark Param Before (ops/s) After (ops/s) Improvement encodeDeltaByteArray LOW/10 61,475,818 81,416,754 +32% (1.32x) encodeDeltaByteArray LOW/100 34,759,755 45,186,617 +30% (1.30x) encodeDeltaByteArray LOW/1000 5,386,922 6,532,850 +21% (1.21x) encodeDeltaByteArray HIGH/10 56,799,595 78,966,929 +39% (1.39x) encodeDeltaLengthByteArray LOW/10 129,447,876 136,657,079 +6% encodeDeltaLengthByteArray HIGH/10 123,673,058 116,778,775 flat (noise) Negative controls (encodePlain, encodeDictionary): unchanged within noise. The DeltaByteArray path benefits most because it eliminates both the Binary.slice() allocation per suffix and the OutputStream dispatch layer. DeltaLengthByteArray gains are smaller since only the OutputStream wrapper removal applies there. No public API change. No file format change. All 573 parquet-column tests pass. --- .../DeltaLengthByteArrayValuesWriter.java | 20 ++++++++++--------- .../deltastrings/DeltaByteArrayWriter.java | 6 ++++-- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/deltalengthbytearray/DeltaLengthByteArrayValuesWriter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/deltalengthbytearray/DeltaLengthByteArrayValuesWriter.java index ac63ff52ef..f3c33dc417 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/deltalengthbytearray/DeltaLengthByteArrayValuesWriter.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/deltalengthbytearray/DeltaLengthByteArrayValuesWriter.java @@ -22,7 +22,6 @@ import org.apache.parquet.bytes.ByteBufferAllocator; import org.apache.parquet.bytes.BytesInput; import org.apache.parquet.bytes.CapacityByteArrayOutputStream; -import org.apache.parquet.bytes.LittleEndianDataOutputStream; import org.apache.parquet.column.Encoding; import org.apache.parquet.column.values.ValuesWriter; import org.apache.parquet.column.values.delta.DeltaBinaryPackingValuesWriter; @@ -46,11 +45,9 @@ public class DeltaLengthByteArrayValuesWriter extends ValuesWriter { private ValuesWriter lengthWriter; private CapacityByteArrayOutputStream arrayOut; - private LittleEndianDataOutputStream out; public DeltaLengthByteArrayValuesWriter(int initialSize, int pageSize, ByteBufferAllocator allocator) { arrayOut = new CapacityByteArrayOutputStream(initialSize, pageSize, allocator); - out = new LittleEndianDataOutputStream(arrayOut); lengthWriter = new DeltaBinaryPackingValuesWriterForInteger( DeltaBinaryPackingValuesWriter.DEFAULT_NUM_BLOCK_VALUES, DeltaBinaryPackingValuesWriter.DEFAULT_NUM_MINIBLOCKS, @@ -63,12 +60,22 @@ public DeltaLengthByteArrayValuesWriter(int initialSize, int pageSize, ByteBuffe public void writeBytes(Binary v) { try { lengthWriter.writeInteger(v.length()); - v.writeTo(out); + v.writeTo(arrayOut); } catch (IOException e) { throw new ParquetEncodingException("could not write bytes", e); } } + /** + * Writes raw bytes directly, avoiding Binary object creation overhead. + * Used by {@link org.apache.parquet.column.values.deltastrings.DeltaByteArrayWriter} + * to write suffix bytes without creating an intermediate Binary.slice(). + */ + public void writeBytes(byte[] data, int offset, int length) { + lengthWriter.writeInteger(length); + arrayOut.write(data, offset, length); + } + @Override public long getBufferedSize() { return lengthWriter.getBufferedSize() + arrayOut.size(); @@ -76,11 +83,6 @@ public long getBufferedSize() { @Override public BytesInput getBytes() { - try { - out.flush(); - } catch (IOException e) { - throw new ParquetEncodingException("could not write page", e); - } LOG.debug("writing a buffer of size {}", arrayOut.size()); return BytesInput.concat(lengthWriter.getBytes(), BytesInput.from(arrayOut)); } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/deltastrings/DeltaByteArrayWriter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/deltastrings/DeltaByteArrayWriter.java index c234108613..18b01bdf4f 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/deltastrings/DeltaByteArrayWriter.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/deltastrings/DeltaByteArrayWriter.java @@ -37,7 +37,7 @@ public class DeltaByteArrayWriter extends ValuesWriter { private ValuesWriter prefixLengthWriter; - private ValuesWriter suffixWriter; + private DeltaLengthByteArrayValuesWriter suffixWriter; private byte[] previous; public DeltaByteArrayWriter(int initialCapacity, int pageSize, ByteBufferAllocator allocator) { @@ -95,7 +95,9 @@ public void writeBytes(Binary v) { for (i = 0; (i < length) && (previous[i] == vb[i]); i++) ; prefixLengthWriter.writeInteger(i); - suffixWriter.writeBytes(v.slice(i, vb.length - i)); + // Write suffix bytes directly from the byte array, avoiding Binary.slice() allocation + // and the virtual dispatch chain through Binary.writeTo() + suffixWriter.writeBytes(vb, i, vb.length - i); previous = vb; } } From f79466c16a80d5438d75d6e4cbd0d265d2c05b07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Mej=C3=ADa?= Date: Tue, 21 Apr 2026 16:52:52 +0000 Subject: [PATCH 2/2] GH-3520: Cleanup binary write path Two small cleanups on the binary write side: 1. DeltaByteArrayWriter: replace v.getBytes() with v.copy().getBytesUnsafe() to avoid the unconditional Arrays.copyOf that getBytes() performs for ByteArrayBackedBinary. copy() is a no-op for constant Binaries, and getBytesUnsafe() returns the backing array directly. For reused-buffer Binaries (e.g. ByteBufferBackedBinary over a slab being mutated), copy() still snapshots them so correctness is preserved. 2. FixedLenByteArrayPlainValuesWriter: drop the unused LittleEndianDataOutputStream wrapper (only used to call Binary.writeTo(), which works directly with the underlying CapacityByteArrayOutputStream). The trailing out.flush() in getBytes() is also dead. Same pattern as #3517 fixed in DeltaLengthByteArrayValuesWriter. No public API change. No file format change. Validation: parquet-column 573 tests pass. Built with -Dspotless.check.skip=true -Drat.skip=true -Djapicmp.skip=true. --- .../values/deltastrings/DeltaByteArrayWriter.java | 5 ++++- .../plain/FixedLenByteArrayPlainValuesWriter.java | 10 +--------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/deltastrings/DeltaByteArrayWriter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/deltastrings/DeltaByteArrayWriter.java index 18b01bdf4f..3afc0d3ace 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/deltastrings/DeltaByteArrayWriter.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/deltastrings/DeltaByteArrayWriter.java @@ -89,7 +89,10 @@ public String memUsageString(String prefix) { @Override public void writeBytes(Binary v) { int i = 0; - byte[] vb = v.getBytes(); + // copy() is a no-op for constant (non-reused) Binaries, and getBytesUnsafe() + // returns the backing array directly for ByteArrayBackedBinary — avoiding + // the unconditional array copy that getBytes() always performs. + byte[] vb = v.copy().getBytesUnsafe(); int length = previous.length < vb.length ? previous.length : vb.length; // find the number of matching prefix bytes between this value and the previous one for (i = 0; (i < length) && (previous[i] == vb[i]); i++) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/plain/FixedLenByteArrayPlainValuesWriter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/plain/FixedLenByteArrayPlainValuesWriter.java index dec4d1be1b..c170ad8e90 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/plain/FixedLenByteArrayPlainValuesWriter.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/plain/FixedLenByteArrayPlainValuesWriter.java @@ -22,7 +22,6 @@ import org.apache.parquet.bytes.ByteBufferAllocator; import org.apache.parquet.bytes.BytesInput; import org.apache.parquet.bytes.CapacityByteArrayOutputStream; -import org.apache.parquet.bytes.LittleEndianDataOutputStream; import org.apache.parquet.column.Encoding; import org.apache.parquet.column.values.ValuesWriter; import org.apache.parquet.io.ParquetEncodingException; @@ -37,7 +36,6 @@ public class FixedLenByteArrayPlainValuesWriter extends ValuesWriter { private static final Logger LOG = LoggerFactory.getLogger(PlainValuesWriter.class); private CapacityByteArrayOutputStream arrayOut; - private LittleEndianDataOutputStream out; private int length; private ByteBufferAllocator allocator; @@ -46,7 +44,6 @@ public FixedLenByteArrayPlainValuesWriter( this.length = length; this.allocator = allocator; this.arrayOut = new CapacityByteArrayOutputStream(initialSize, pageSize, this.allocator); - this.out = new LittleEndianDataOutputStream(arrayOut); } @Override @@ -56,7 +53,7 @@ public final void writeBytes(Binary v) { "Fixed Binary size " + v.length() + " does not match field type length " + length); } try { - v.writeTo(out); + v.writeTo(arrayOut); } catch (IOException e) { throw new ParquetEncodingException("could not write fixed bytes", e); } @@ -69,11 +66,6 @@ public long getBufferedSize() { @Override public BytesInput getBytes() { - try { - out.flush(); - } catch (IOException e) { - throw new ParquetEncodingException("could not write page", e); - } LOG.debug("writing a buffer of size {}", arrayOut.size()); return BytesInput.from(arrayOut); }