Skip to content

Commit 8c66f20

Browse files
committed
Lots of cleanup after Sketches class was removed.
Fixed some very bad assumptions with heapify(...) and wrap(...) where if the simple call was made, i.e., heapify(seg) not only was the default update seed assumed (this is normal), but the seed hash was not checked! Yikes! Same thing with wrap(...). I'm not sure how that ever happened, but it is now fixed. I think a lot of that was associated with the old SerVer 1 and 2 stuff. This also fixed the squirrelly testing that somehow figured that the above was ok!?! Also added some checks when reading the preamble of a user's input segment -- made sure that the segment was at least large enough to hold the full preamble.
1 parent 5c3b845 commit 8c66f20

16 files changed

Lines changed: 127 additions & 188 deletions

src/main/java/org/apache/datasketches/theta/CompactOperations.java

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
import static org.apache.datasketches.theta.PreambleUtil.READ_ONLY_FLAG_MASK;
3030
import static org.apache.datasketches.theta.PreambleUtil.SER_VER;
3131
import static org.apache.datasketches.theta.PreambleUtil.SINGLEITEM_FLAG_MASK;
32+
import static org.apache.datasketches.theta.PreambleUtil.checkSegPreambleCap;
3233
import static org.apache.datasketches.theta.PreambleUtil.extractCurCount;
3334
import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID;
3435
import static org.apache.datasketches.theta.PreambleUtil.extractFlags;
@@ -121,7 +122,7 @@ static CompactSketch segmentToCompact(
121122
final MemorySegment dstWSeg)
122123
{
123124
//extract Pre0 fields and Flags from srcMem
124-
final int srcPreLongs = Sketch.getPreambleLongs(srcSeg);
125+
final int srcPreLongs = checkSegPreambleCap(srcSeg);
125126
final int srcSerVer = extractSerVer(srcSeg); //not used
126127
final int srcFamId = extractFamilyID(srcSeg);
127128
final int srcLgArrLongs = extractLgArrLongs(srcSeg);
@@ -136,7 +137,7 @@ static CompactSketch segmentToCompact(
136137
final boolean srcSingleFlag = (srcFlags & SINGLEITEM_FLAG_MASK) > 0;
137138

138139
final boolean single = srcSingleFlag
139-
|| SingleItemSketch.otherCheckForSingleItem(srcPreLongs, srcSerVer, srcFamId, srcFlags);
140+
|| SingleItemSketch.checkForSingleItem(srcPreLongs, srcSerVer, srcFamId, srcFlags);
140141

141142
//extract pre1 and pre2 fields
142143
final int curCount = single ? 1 : (srcPreLongs > 1) ? extractCurCount(srcSeg) : 0;
@@ -318,7 +319,7 @@ static long[] compactCache(final long[] srcCache, final int curCount,
318319
* This is checked in all compacting operations.
319320
* 7 <1.0 !0 F OK This corresponds to a sketch in estimation mode
320321
* </pre>
321-
* #4 is handled by <i>correctThetaOnCompat(boolean, int)</i> (below).
322+
* #4 is handled by <i>correctThetaOnCompact(boolean, int)</i> (below).
322323
* #2 & #6 handled by <i>checkIllegalCurCountAndEmpty(boolean, int)</i>
323324
*/
324325

src/main/java/org/apache/datasketches/theta/CompactSketch.java

Lines changed: 11 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -22,15 +22,13 @@
2222
import static java.lang.foreign.ValueLayout.JAVA_BYTE;
2323
import static java.lang.foreign.ValueLayout.JAVA_LONG_UNALIGNED;
2424
import static java.lang.foreign.ValueLayout.JAVA_SHORT_UNALIGNED;
25-
import static org.apache.datasketches.common.ByteArrayUtil.getShortLE;
2625
import static org.apache.datasketches.common.Family.idToFamily;
2726
import static org.apache.datasketches.theta.PreambleUtil.COMPACT_FLAG_MASK;
2827
import static org.apache.datasketches.theta.PreambleUtil.EMPTY_FLAG_MASK;
2928
import static org.apache.datasketches.theta.PreambleUtil.FLAGS_BYTE;
3029
import static org.apache.datasketches.theta.PreambleUtil.ORDERED_FLAG_MASK;
3130
import static org.apache.datasketches.theta.PreambleUtil.PREAMBLE_LONGS_BYTE;
3231
import static org.apache.datasketches.theta.PreambleUtil.READ_ONLY_FLAG_MASK;
33-
import static org.apache.datasketches.theta.PreambleUtil.SEED_HASH_SHORT;
3432
import static org.apache.datasketches.theta.PreambleUtil.extractEntryBitsV4;
3533
import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID;
3634
import static org.apache.datasketches.theta.PreambleUtil.extractFlags;
@@ -39,7 +37,7 @@
3937
import static org.apache.datasketches.theta.PreambleUtil.extractSerVer;
4038
import static org.apache.datasketches.theta.PreambleUtil.extractThetaLongV4;
4139
import static org.apache.datasketches.theta.PreambleUtil.wholeBytesToHoldBits;
42-
import static org.apache.datasketches.theta.SingleItemSketch.otherCheckForSingleItem;
40+
import static org.apache.datasketches.theta.SingleItemSketch.checkForSingleItem;
4341

4442
import java.lang.foreign.MemorySegment;
4543

@@ -160,7 +158,7 @@ public static CompactSketch wrap(final MemorySegment srcSeg, final long expected
160158
if (PreambleUtil.isEmptyFlag(srcSeg)) {
161159
return EmptyCompactSketch.getHeapInstance(srcSeg);
162160
}
163-
if (otherCheckForSingleItem(srcSeg)) {
161+
if (checkForSingleItem(srcSeg)) {
164162
return SingleItemSketch.heapify(srcSeg, seedHash);
165163
}
166164
//not empty & not singleItem
@@ -190,72 +188,57 @@ public static CompactSketch wrap(final MemorySegment srcSeg, final long expected
190188
* There is no data copying onto the java heap.
191189
* The wrap operation enables fast read-only merging and access to all the public read-only API.
192190
*
193-
* <p>Only "Direct" Serialization Versions 3 and 4 (i.e, OpenSource) sketches that have
194-
* been explicitly stored as direct sketches can be wrapped.
195-
* Wrapping earlier serial version sketches will result in a heapify operation.
196-
* These early versions were never designed to "wrap".</p>
191+
* <p>Only sketches that have been explicitly stored as direct sketches can be wrapped.</p>
197192
*
198193
* <p>Wrapping any subclass of this class that is empty or contains only a single item will
199194
* result in heapified forms of empty and single item sketch respectively.
200195
* This is actually faster and consumes less overall space.</p>
201196
*
202-
* <p>This method checks if the DEFAULT_UPDATE_SEED was used to create the source byte array image.
203-
* Note that SerialVersion 1 (pre-open-source) sketches cannot be checked as they don't have a seedHash field,
204-
* so the resulting heapified CompactSketch will be given the hash of DEFAULT_UPDATE_SEED.</p>
197+
* <p>This method checks if the DEFAULT_UPDATE_SEED was used to create the source byte array image.</p>
205198
*
206199
* @param bytes a byte array image of a Sketch that was created using the DEFAULT_UPDATE_SEED.
207200
*
208201
* @return a CompactSketch backed by the given byte array except as above.
209202
*/
210203
public static CompactSketch wrap(final byte[] bytes) {
211-
return wrap(bytes, Util.DEFAULT_UPDATE_SEED, false);
204+
return wrap(bytes, Util.DEFAULT_UPDATE_SEED);
212205
}
213206

214207
/**
215208
* Wrap takes the sketch image in the given byte array and refers to it directly.
216209
* There is no data copying onto the java heap.
217210
* The wrap operation enables fast read-only merging and access to all the public read-only API.
218211
*
219-
* <p>Only "Direct" Serialization Versions 3 and 4 (i.e, OpenSource) sketches that have
220-
* been explicitly stored as direct sketches can be wrapped.
221-
* Wrapping earlier serial version sketches will result in a heapify operation.
222-
* These early versions were never designed to "wrap".</p>
212+
* <p>Only sketches that have been explicitly stored as direct sketches can be wrapped.</p>
223213
*
224214
* <p>Wrapping any subclass of this class that is empty or contains only a single item will
225215
* result in heapified forms of empty and single item sketch respectively.
226216
* This is actually faster and consumes less overall space.</p>
227217
*
228-
* <p>This method checks if the given expectedSeed was used to create the source byte array image.
229-
* Note that SerialVersion 1 sketches cannot be checked as they don't have a seedHash field,
230-
* so the resulting heapified CompactSketch will be given the hash of the expectedSeed.</p>
218+
* <p>This method checks if the given expectedSeed was used to create the source byte array image.</p>
231219
*
232220
* @param bytes a byte array image of a Sketch that was created using the given expectedSeed.
233221
* @param expectedSeed the seed used to validate the given byte array image.
234222
* <a href="{@docRoot}/resources/dictionary.html#seed">See Update Hash Seed</a>.
235223
* @return a CompactSketch backed by the given byte array except as above.
236224
*/
237225
public static CompactSketch wrap(final byte[] bytes, final long expectedSeed) {
238-
return wrap(bytes, expectedSeed, true);
239-
}
240-
241-
private static CompactSketch wrap(final byte[] bytes, final long seed, final boolean enforceSeed) {
242226
final int serVer = bytes[PreambleUtil.SER_VER_BYTE];
243227
final int familyId = bytes[PreambleUtil.FAMILY_BYTE];
244228
final Family family = Family.idToFamily(familyId);
245229
if (family != Family.COMPACT) {
246230
throw new SketchesArgumentException("Corrupted: " + family + " is not Compact!");
247231
}
248-
final short seedHash = Util.computeSeedHash(seed);
249-
232+
final short seedHash = Util.computeSeedHash(expectedSeed);
250233

251234
if (serVer == 3) {
252235
final int flags = bytes[FLAGS_BYTE];
253236
if ((flags & EMPTY_FLAG_MASK) > 0) {
254237
return EmptyCompactSketch.getHeapInstance(MemorySegment.ofArray(bytes));
255238
}
256239
final int preLongs = bytes[PREAMBLE_LONGS_BYTE];
257-
if (otherCheckForSingleItem(preLongs, serVer, familyId, flags)) {
258-
return SingleItemSketch.heapify(MemorySegment.ofArray(bytes), enforceSeed ? seedHash : getShortLE(bytes, SEED_HASH_SHORT));
240+
if (checkForSingleItem(preLongs, serVer, familyId, flags)) {
241+
return SingleItemSketch.heapify(MemorySegment.ofArray(bytes), seedHash);
259242
}
260243
//not empty & not singleItem
261244
final boolean compactFlag = (flags & COMPACT_FLAG_MASK) > 0;
@@ -268,17 +251,14 @@ private static CompactSketch wrap(final byte[] bytes, final long seed, final boo
268251
throw new SketchesArgumentException(
269252
"Corrupted: COMPACT family sketch image must have Read-Only flag set");
270253
}
271-
return WrappedCompactSketch.wrapInstance(bytes,
272-
enforceSeed ? seedHash : getShortLE(bytes, SEED_HASH_SHORT));
254+
return WrappedCompactSketch.wrapInstance(bytes, seedHash);
273255
}
274256
if (serVer ==4) {
275257
return WrappedCompactCompressedSketch.wrapInstance(bytes, seedHash);
276258
}
277259
//not SerVer 3 or 4
278260
throw new SketchesArgumentException(
279261
"Corrupted: Serialization Version " + serVer + " not recognized.");
280-
281-
282262
}
283263

284264
//Sketch Overrides

src/main/java/org/apache/datasketches/theta/DirectCompactCompressedSketch.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ public int getCurrentBytes() {
8484
private static final int START_PACKED_DATA_ESTIMATION_MODE = 16;
8585

8686
@Override
87-
public int getRetainedEntries(final boolean valid) { //compact is always valid
87+
public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch
8888
// number of entries is stored using variable length encoding
8989
// most significant bytes with all zeros are not stored
9090
// one byte in the preamble has the number of non-zero bytes used

src/main/java/org/apache/datasketches/theta/DirectCompactSketch.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
import static org.apache.datasketches.theta.PreambleUtil.extractFlags;
2929
import static org.apache.datasketches.theta.PreambleUtil.extractSeedHash;
3030
import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong;
31-
import static org.apache.datasketches.theta.SingleItemSketch.otherCheckForSingleItem;
31+
import static org.apache.datasketches.theta.SingleItemSketch.checkForSingleItem;
3232

3333
import java.lang.foreign.MemorySegment;
3434

@@ -80,15 +80,15 @@ public CompactSketch compact(final boolean dstOrdered, final MemorySegment dstSe
8080

8181
@Override
8282
public int getCurrentBytes() {
83-
if (otherCheckForSingleItem(seg_)) { return 16; }
83+
if (checkForSingleItem(seg_)) { return 16; }
8484
final int preLongs = Sketch.getPreambleLongs(seg_);
8585
final int curCount = (preLongs == 1) ? 0 : extractCurCount(seg_);
8686
return (preLongs + curCount) << 3;
8787
}
8888

8989
@Override
90-
public int getRetainedEntries(final boolean valid) { //compact is always valid
91-
if (otherCheckForSingleItem(seg_)) { return 1; }
90+
public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch
91+
if (checkForSingleItem(seg_)) { return 1; }
9292
final int preLongs = Sketch.getPreambleLongs(seg_);
9393
return (preLongs == 1) ? 0 : extractCurCount(seg_);
9494
}
@@ -146,7 +146,7 @@ public byte[] toByteArray() {
146146

147147
@Override
148148
long[] getCache() {
149-
if (otherCheckForSingleItem(seg_)) { return new long[] { seg_.get(JAVA_LONG_UNALIGNED, 8) }; }
149+
if (checkForSingleItem(seg_)) { return new long[] { seg_.get(JAVA_LONG_UNALIGNED, 8) }; }
150150
final int preLongs = Sketch.getPreambleLongs(seg_);
151151
final int curCount = (preLongs == 1) ? 0 : extractCurCount(seg_);
152152
if (curCount > 0) {

src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketch.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,7 @@ static DirectQuickSelectSketch writableWrap(
179179
final int lgNomLongs = extractLgNomLongs(srcSeg); //byte 3
180180
final int lgArrLongs = extractLgArrLongs(srcSeg); //byte 4
181181

182-
UpdateSketch.checkUnionQuickSelectFamily(srcSeg, preambleLongs, lgNomLongs);
182+
UpdateSketch.checkUnionAndQuickSelectFamily(srcSeg, preambleLongs, lgNomLongs);
183183
checkSegIntegrity(srcSeg, seed, preambleLongs, lgNomLongs, lgArrLongs);
184184

185185
if (isResizeFactorIncorrect(srcSeg, lgNomLongs, lgArrLongs)) {

src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@
2828
import static org.apache.datasketches.theta.CompactOperations.correctThetaOnCompact;
2929
import static org.apache.datasketches.theta.PreambleUtil.FAMILY_BYTE;
3030
import static org.apache.datasketches.theta.PreambleUtil.LG_ARR_LONGS_BYTE;
31-
//import static org.apache.datasketches.theta.PreambleUtil.LG_NOM_LONGS_BYTE;
3231
import static org.apache.datasketches.theta.PreambleUtil.LG_RESIZE_FACTOR_BIT;
3332
import static org.apache.datasketches.theta.PreambleUtil.PREAMBLE_LONGS_BYTE;
3433
import static org.apache.datasketches.theta.PreambleUtil.P_FLOAT;
@@ -38,6 +37,7 @@
3837
import static org.apache.datasketches.theta.PreambleUtil.extractLgArrLongs;
3938
import static org.apache.datasketches.theta.PreambleUtil.extractLgNomLongs;
4039
import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong;
40+
import static org.apache.datasketches.theta.PreambleUtil.checkSegPreambleCap;
4141
import static org.apache.datasketches.theta.PreambleUtil.insertThetaLong;
4242

4343
import java.lang.foreign.MemorySegment;
@@ -102,12 +102,11 @@ private DirectQuickSelectSketchR(final long seed, final MemorySegment srcSeg) {
102102
* @return instance of this sketch
103103
*/
104104
static DirectQuickSelectSketchR readOnlyWrap(final MemorySegment srcSeg, final long seed) {
105-
final int preambleLongs = Sketch.getPreambleLongs(srcSeg); //byte 0
105+
final int preambleLongs = checkSegPreambleCap(srcSeg); //byte 0
106106
final int lgNomLongs = extractLgNomLongs(srcSeg); //byte 3
107107
final int lgArrLongs = extractLgArrLongs(srcSeg); //byte 4
108-
109-
UpdateSketch.checkUnionQuickSelectFamily(srcSeg, preambleLongs, lgNomLongs);
110108
checkSegIntegrity(srcSeg, seed, preambleLongs, lgNomLongs, lgArrLongs);
109+
UpdateSketch.checkUnionAndQuickSelectFamily(srcSeg, preambleLongs, lgNomLongs);
111110
return new DirectQuickSelectSketchR(seed, srcSeg);
112111
}
113112

@@ -147,7 +146,7 @@ public Family getFamily() {
147146
}
148147

149148
@Override
150-
public int getRetainedEntries(final boolean valid) { //always valid for theta
149+
public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch
151150
return wseg_.get(JAVA_INT_UNALIGNED, RETAINED_ENTRIES_INT);
152151
}
153152

src/main/java/org/apache/datasketches/theta/EmptyCompactSketch.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ public int getCurrentBytes() {
8686
public double getEstimate() { return 0; }
8787

8888
@Override
89-
public int getRetainedEntries(final boolean valid) {
89+
public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch
9090
return 0;
9191
}
9292

src/main/java/org/apache/datasketches/theta/HeapAlphaSketch.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,7 @@ public double getLowerBound(final int numStdDev) {
208208
}
209209

210210
@Override
211-
public int getRetainedEntries(final boolean valid) {
211+
public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch
212212
if (curCount_ > 0) {
213213
if (valid && isDirty()) {
214214
return HashOperations.countPart(getCache(), getLgArrLongs(), getThetaLong());

src/main/java/org/apache/datasketches/theta/HeapCompactSketch.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ public int getCurrentBytes() {
8888
}
8989

9090
@Override
91-
public int getRetainedEntries(final boolean valid) {
91+
public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch
9292
return curCount_;
9393
}
9494

src/main/java/org/apache/datasketches/theta/HeapQuickSelectSketch.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ static HeapQuickSelectSketch heapifyInstance(final MemorySegment srcSeg, final l
111111
final int lgNomLongs = extractLgNomLongs(srcSeg); //byte 3
112112
final int lgArrLongs = extractLgArrLongs(srcSeg); //byte 4
113113

114-
checkUnionQuickSelectFamily(srcSeg, preambleLongs, lgNomLongs);
114+
checkUnionAndQuickSelectFamily(srcSeg, preambleLongs, lgNomLongs);
115115
checkSegIntegrity(srcSeg, seed, preambleLongs, lgNomLongs, lgArrLongs);
116116

117117
final float p = extractP(srcSeg); //bytes 12-15
@@ -149,7 +149,7 @@ public Family getFamily() {
149149
}
150150

151151
@Override
152-
public int getRetainedEntries(final boolean valid) {
152+
public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch
153153
return curCount_;
154154
}
155155

0 commit comments

Comments
 (0)