Skip to content

Commit 41d7183

Browse files
Merge pull request #648 from apache/compressed_iterator
wrap(byte[])
2 parents 2dd1d5e + bde0b6f commit 41d7183

7 files changed

Lines changed: 512 additions & 2 deletions

File tree

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.apache.datasketches.theta;
21+
22+
/*
23+
* This is to uncompress serial version 4 sketch incrementally
24+
*/
25+
class BytesCompactCompressedHashIterator implements HashIterator {
26+
private byte[] bytes;
27+
private int offset;
28+
private int entryBits;
29+
private int numEntries;
30+
private int index;
31+
private long previous;
32+
private int offsetBits;
33+
private long[] buffer;
34+
private boolean isBlockMode;
35+
36+
BytesCompactCompressedHashIterator(
37+
final byte[] bytes,
38+
final int offset,
39+
final int entryBits,
40+
final int numEntries
41+
) {
42+
this.bytes = bytes;
43+
this.offset = offset;
44+
this.entryBits = entryBits;
45+
this.numEntries = numEntries;
46+
index = -1;
47+
previous = 0;
48+
offsetBits = 0;
49+
buffer = new long[8];
50+
isBlockMode = numEntries >= 8;
51+
}
52+
53+
@Override
54+
public long get() {
55+
return buffer[index & 7];
56+
}
57+
58+
@Override
59+
public boolean next() {
60+
if (++index == numEntries) { return false; }
61+
if (isBlockMode) {
62+
if ((index & 7) == 0) {
63+
if (numEntries - index >= 8) {
64+
unpack8();
65+
} else {
66+
isBlockMode = false;
67+
unpack1();
68+
}
69+
}
70+
} else {
71+
unpack1();
72+
}
73+
return true;
74+
}
75+
76+
private void unpack1() {
77+
final int i = index & 7;
78+
BitPacking.unpackBits(buffer, i, entryBits, bytes, offset, offsetBits);
79+
offset += (offsetBits + entryBits) >>> 3;
80+
offsetBits = (offsetBits + entryBits) & 7;
81+
buffer[i] += previous;
82+
previous = buffer[i];
83+
}
84+
85+
private void unpack8() {
86+
BitPacking.unpackBitsBlock8(buffer, 0, bytes, offset, entryBits);
87+
offset += entryBits;
88+
for (int i = 0; i < 8; i++) {
89+
buffer[i] += previous;
90+
previous = buffer[i];
91+
}
92+
}
93+
}
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.apache.datasketches.theta;
21+
22+
import org.apache.datasketches.common.ByteArrayUtil;
23+
24+
/*
25+
* This is to iterate over serial version 3 sketch representation
26+
*/
27+
class BytesCompactHashIterator implements HashIterator {
28+
final private byte[] bytes;
29+
final private int offset;
30+
final private int numEntries;
31+
private int index;
32+
33+
BytesCompactHashIterator(
34+
final byte[] bytes,
35+
final int offset,
36+
final int numEntries
37+
) {
38+
this.bytes = bytes;
39+
this.offset = offset;
40+
this.numEntries = numEntries;
41+
index = -1;
42+
}
43+
44+
@Override
45+
public long get() {
46+
return ByteArrayUtil.getLongLE(bytes, offset + index * Long.BYTES);
47+
}
48+
49+
@Override
50+
public boolean next() {
51+
return ++index < numEntries;
52+
}
53+
}

src/main/java/org/apache/datasketches/theta/CompactSketch.java

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,15 @@
1919

2020
package org.apache.datasketches.theta;
2121

22+
import static org.apache.datasketches.common.ByteArrayUtil.getShortLE;
2223
import static org.apache.datasketches.common.Family.idToFamily;
2324
import static org.apache.datasketches.theta.PreambleUtil.COMPACT_FLAG_MASK;
2425
import static org.apache.datasketches.theta.PreambleUtil.EMPTY_FLAG_MASK;
26+
import static org.apache.datasketches.theta.PreambleUtil.FLAGS_BYTE;
2527
import static org.apache.datasketches.theta.PreambleUtil.ORDERED_FLAG_MASK;
28+
import static org.apache.datasketches.theta.PreambleUtil.PREAMBLE_LONGS_BYTE;
2629
import static org.apache.datasketches.theta.PreambleUtil.READ_ONLY_FLAG_MASK;
30+
import static org.apache.datasketches.theta.PreambleUtil.SEED_HASH_SHORT;
2731
import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID;
2832
import static org.apache.datasketches.theta.PreambleUtil.extractFlags;
2933
import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs;
@@ -224,6 +228,56 @@ else if (serVer == 2) {
224228
"Corrupted: Serialization Version " + serVer + " not recognized.");
225229
}
226230

231+
public static CompactSketch wrap(final byte[] bytes) {
232+
return wrap(bytes, ThetaUtil.DEFAULT_UPDATE_SEED, false);
233+
}
234+
235+
public static CompactSketch wrap(final byte[] bytes, final long expectedSeed) {
236+
return wrap(bytes, expectedSeed, true);
237+
}
238+
239+
private static CompactSketch wrap(final byte[] bytes, final long seed, final boolean enforceSeed) {
240+
final int serVer = bytes[PreambleUtil.SER_VER_BYTE];
241+
final int familyId = bytes[PreambleUtil.FAMILY_BYTE];
242+
final Family family = Family.idToFamily(familyId);
243+
if (family != Family.COMPACT) {
244+
throw new IllegalArgumentException("Corrupted: " + family + " is not Compact!");
245+
}
246+
final short seedHash = ThetaUtil.computeSeedHash(seed);
247+
if (serVer == 4) {
248+
return WrappedCompactCompressedSketch.wrapInstance(bytes, seedHash);
249+
} else if (serVer == 3) {
250+
final int flags = bytes[FLAGS_BYTE];
251+
if ((flags & EMPTY_FLAG_MASK) > 0) {
252+
return EmptyCompactSketch.getHeapInstance(Memory.wrap(bytes));
253+
}
254+
final int preLongs = bytes[PREAMBLE_LONGS_BYTE];
255+
if (otherCheckForSingleItem(preLongs, serVer, familyId, flags)) {
256+
return SingleItemSketch.heapify(Memory.wrap(bytes), enforceSeed ? seedHash : getShortLE(bytes, SEED_HASH_SHORT));
257+
}
258+
//not empty & not singleItem
259+
final boolean compactFlag = (flags & COMPACT_FLAG_MASK) > 0;
260+
if (!compactFlag) {
261+
throw new SketchesArgumentException(
262+
"Corrupted: COMPACT family sketch image must have compact flag set");
263+
}
264+
final boolean readOnly = (flags & READ_ONLY_FLAG_MASK) > 0;
265+
if (!readOnly) {
266+
throw new SketchesArgumentException(
267+
"Corrupted: COMPACT family sketch image must have Read-Only flag set");
268+
}
269+
return WrappedCompactSketch.wrapInstance(bytes,
270+
enforceSeed ? seedHash : getShortLE(bytes, SEED_HASH_SHORT));
271+
} else if (serVer == 1) {
272+
return ForwardCompatibility.heapify1to3(Memory.wrap(bytes), seedHash);
273+
} else if (serVer == 2) {
274+
return ForwardCompatibility.heapify2to3(Memory.wrap(bytes),
275+
enforceSeed ? seedHash : getShortLE(bytes, SEED_HASH_SHORT));
276+
}
277+
throw new SketchesArgumentException(
278+
"Corrupted: Serialization Version " + serVer + " not recognized.");
279+
}
280+
227281
//Sketch Overrides
228282

229283
@Override

src/main/java/org/apache/datasketches/theta/DirectCompactSketch.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535

3636
/**
3737
* An off-heap (Direct), compact, read-only sketch. The internal hash array can be either ordered
38-
* or unordered.
38+
* or unordered. It is not empty, not a single item.
3939
*
4040
* <p>This sketch can only be associated with a Serialization Version 3 format binary image.</p>
4141
*
@@ -57,7 +57,7 @@ class DirectCompactSketch extends CompactSketch {
5757
}
5858

5959
/**
60-
* Wraps the given Memory, which must be a SerVer 3, ordered, CompactSketch image.
60+
* Wraps the given Memory, which must be a SerVer 3, CompactSketch image.
6161
* Must check the validity of the Memory before calling. The order bit must be set properly.
6262
* @param srcMem <a href="{@docRoot}/resources/dictionary.html#mem">See Memory</a>
6363
* @param seedHash The update seedHash.
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.apache.datasketches.theta;
21+
22+
import static org.apache.datasketches.theta.PreambleUtil.wholeBytesToHoldBits;
23+
import static org.apache.datasketches.theta.PreambleUtil.ENTRY_BITS_BYTE_V4;
24+
import static org.apache.datasketches.theta.PreambleUtil.NUM_ENTRIES_BYTES_BYTE_V4;
25+
import static org.apache.datasketches.theta.PreambleUtil.PREAMBLE_LONGS_BYTE;
26+
27+
import org.apache.datasketches.common.ByteArrayUtil;
28+
import org.apache.datasketches.thetacommon.ThetaUtil;
29+
30+
/**
31+
* Wrapper around a serialized compact compressed read-only sketch. It is not empty, not a single item.
32+
*
33+
* <p>This sketch can only be associated with a Serialization Version 4 format binary image.</p>
34+
*/
35+
class WrappedCompactCompressedSketch extends WrappedCompactSketch {
36+
37+
/**
38+
* Construct this sketch with the given bytes.
39+
* @param bytes containing serialized compact compressed sketch.
40+
*/
41+
WrappedCompactCompressedSketch(final byte[] bytes) {
42+
super(bytes);
43+
}
44+
45+
/**
46+
* Wraps the given bytes, which must be a SerVer 4 compressed CompactSketch image.
47+
* @param bytes representation of serialized compressed compact sketch.
48+
* @param seedHash The update seedHash.
49+
* <a href="{@docRoot}/resources/dictionary.html#seedHash">See Seed Hash</a>.
50+
* @return this sketch
51+
*/
52+
static WrappedCompactCompressedSketch wrapInstance(final byte[] bytes, final short seedHash) {
53+
ThetaUtil.checkSeedHashes(ByteArrayUtil.getShortLE(bytes, PreambleUtil.SEED_HASH_SHORT), seedHash);
54+
return new WrappedCompactCompressedSketch(bytes);
55+
}
56+
57+
//Sketch Overrides
58+
59+
@Override
60+
public int getCurrentBytes() {
61+
final int preLongs = bytes_[PREAMBLE_LONGS_BYTE];
62+
final int entryBits = bytes_[ENTRY_BITS_BYTE_V4];
63+
final int numEntriesBytes = bytes_[NUM_ENTRIES_BYTES_BYTE_V4];
64+
return preLongs * Long.BYTES + numEntriesBytes + wholeBytesToHoldBits(getRetainedEntries() * entryBits);
65+
}
66+
67+
private static final int START_PACKED_DATA_EXACT_MODE = 8;
68+
private static final int START_PACKED_DATA_ESTIMATION_MODE = 16;
69+
70+
@Override
71+
public int getRetainedEntries(final boolean valid) { //compact is always valid
72+
// number of entries is stored using variable length encoding
73+
// most significant bytes with all zeros are not stored
74+
// one byte in the preamble has the number of non-zero bytes used
75+
final int preLongs = bytes_[PREAMBLE_LONGS_BYTE]; // if > 1 then the second long has theta
76+
final int numEntriesBytes = bytes_[NUM_ENTRIES_BYTES_BYTE_V4];
77+
int offsetBytes = preLongs > 1 ? START_PACKED_DATA_ESTIMATION_MODE : START_PACKED_DATA_EXACT_MODE;
78+
int numEntries = 0;
79+
for (int i = 0; i < numEntriesBytes; i++) {
80+
numEntries |= Byte.toUnsignedInt(bytes_[offsetBytes++]) << (i << 3);
81+
}
82+
return numEntries;
83+
}
84+
85+
@Override
86+
public long getThetaLong() {
87+
final int preLongs = bytes_[PREAMBLE_LONGS_BYTE];
88+
return (preLongs > 1) ? ByteArrayUtil.getLongLE(bytes_, 8) : Long.MAX_VALUE;
89+
}
90+
91+
@Override
92+
public boolean isEmpty() {
93+
return false;
94+
}
95+
96+
@Override
97+
public boolean isOrdered() {
98+
return true;
99+
}
100+
101+
@Override
102+
public HashIterator iterator() {
103+
return new BytesCompactCompressedHashIterator(
104+
bytes_,
105+
(bytes_[PREAMBLE_LONGS_BYTE] > 1 ? START_PACKED_DATA_ESTIMATION_MODE : START_PACKED_DATA_EXACT_MODE)
106+
+ bytes_[NUM_ENTRIES_BYTES_BYTE_V4],
107+
bytes_[ENTRY_BITS_BYTE_V4],
108+
getRetainedEntries()
109+
);
110+
}
111+
}

0 commit comments

Comments
 (0)