|
41 | 41 | Or, |
42 | 42 | StartsWith, |
43 | 43 | ) |
44 | | -from pyiceberg.expressions.visitors import _InclusiveMetricsEvaluator, _StrictMetricsEvaluator |
| 44 | +from pyiceberg.expressions.visitors import ( |
| 45 | + ROWS_CANNOT_MATCH, |
| 46 | + ROWS_MIGHT_MATCH, |
| 47 | + ROWS_MIGHT_NOT_MATCH, |
| 48 | + ROWS_MUST_MATCH, |
| 49 | + _InclusiveMetricsEvaluator, |
| 50 | + _StrictMetricsEvaluator, |
| 51 | +) |
45 | 52 | from pyiceberg.manifest import DataFile, FileFormat |
46 | 53 | from pyiceberg.schema import Schema |
47 | 54 | from pyiceberg.typedef import Record |
|
50 | 57 | FloatType, |
51 | 58 | IcebergType, |
52 | 59 | IntegerType, |
| 60 | + LongType, |
53 | 61 | NestedField, |
54 | 62 | PrimitiveType, |
55 | 63 | StringType, |
@@ -1463,3 +1471,81 @@ def test_strict_integer_not_in(strict_data_file_schema: Schema, strict_data_file |
1463 | 1471 |
|
1464 | 1472 | should_read = _StrictMetricsEvaluator(strict_data_file_schema, NotIn("no_nulls", {"abc", "def"})).eval(strict_data_file_1) |
1465 | 1473 | assert not should_read, "Should not match: no_nulls field does not have bounds" |
| 1474 | + |
| 1475 | + |
| 1476 | +@pytest.mark.parametrize( |
| 1477 | + "file_type, evolved_type, lower_bound, upper_bound, op, lit, expected", |
| 1478 | + [ |
| 1479 | + # Int -> Long |
| 1480 | + (IntegerType(), LongType(), 30, 79, GreaterThan, 100, ROWS_CANNOT_MATCH), |
| 1481 | + (IntegerType(), LongType(), 30, 79, LessThan, 50, ROWS_MIGHT_MATCH), |
| 1482 | + # Float -> Double |
| 1483 | + (FloatType(), DoubleType(), 30.0, 79.0, GreaterThan, 100.0, ROWS_CANNOT_MATCH), |
| 1484 | + (FloatType(), DoubleType(), 30.0, 79.0, LessThan, 50.0, ROWS_MIGHT_MATCH), |
| 1485 | + ], |
| 1486 | +) |
| 1487 | +def test_inclusive_metrics_eval_bounds_after_promotion( |
| 1488 | + file_type: PrimitiveType, |
| 1489 | + evolved_type: PrimitiveType, |
| 1490 | + lower_bound: Any, |
| 1491 | + upper_bound: Any, |
| 1492 | + op: Any, |
| 1493 | + lit: Any, |
| 1494 | + expected: bool, |
| 1495 | +) -> None: |
| 1496 | + schema = Schema(NestedField(1, "col", evolved_type, required=True)) |
| 1497 | + |
| 1498 | + data_file = DataFile.from_args( |
| 1499 | + file_path="file_1.parquet", |
| 1500 | + file_format=FileFormat.PARQUET, |
| 1501 | + partition={}, |
| 1502 | + record_count=100, |
| 1503 | + file_size_in_bytes=1024, |
| 1504 | + lower_bounds={1: to_bytes(file_type, lower_bound)}, |
| 1505 | + upper_bounds={1: to_bytes(file_type, upper_bound)}, |
| 1506 | + ) |
| 1507 | + |
| 1508 | + evaluator = _InclusiveMetricsEvaluator(schema, op("col", lit)) |
| 1509 | + assert evaluator.eval(data_file) == expected |
| 1510 | + |
| 1511 | + |
| 1512 | +@pytest.mark.parametrize( |
| 1513 | + "file_type, evolved_type, lower_bound, upper_bound, op, lit, expected", |
| 1514 | + [ |
| 1515 | + # Int -> Long |
| 1516 | + (IntegerType(), LongType(), 30, 79, GreaterThan, 20, ROWS_MUST_MATCH), |
| 1517 | + (IntegerType(), LongType(), 30, 79, GreaterThan, 100, ROWS_MIGHT_NOT_MATCH), |
| 1518 | + (IntegerType(), LongType(), 30, 79, LessThan, 100, ROWS_MUST_MATCH), |
| 1519 | + (IntegerType(), LongType(), 30, 79, LessThan, 20, ROWS_MIGHT_NOT_MATCH), |
| 1520 | + # Float -> Double |
| 1521 | + (FloatType(), DoubleType(), 30.0, 79.0, GreaterThan, 20.0, ROWS_MUST_MATCH), |
| 1522 | + (FloatType(), DoubleType(), 30.0, 79.0, GreaterThan, 100.0, ROWS_MIGHT_NOT_MATCH), |
| 1523 | + (FloatType(), DoubleType(), 30.0, 79.0, LessThan, 100.0, ROWS_MUST_MATCH), |
| 1524 | + (FloatType(), DoubleType(), 30.0, 79.0, LessThan, 20.0, ROWS_MIGHT_NOT_MATCH), |
| 1525 | + ], |
| 1526 | +) |
| 1527 | +def test_strict_metrics_eval_bounds_after_promotion( |
| 1528 | + file_type: PrimitiveType, |
| 1529 | + evolved_type: PrimitiveType, |
| 1530 | + lower_bound: Any, |
| 1531 | + upper_bound: Any, |
| 1532 | + op: Any, |
| 1533 | + lit: Any, |
| 1534 | + expected: bool, |
| 1535 | +) -> None: |
| 1536 | + schema = Schema(NestedField(1, "col", evolved_type, required=True)) |
| 1537 | + |
| 1538 | + data_file = DataFile.from_args( |
| 1539 | + file_path="file_1.parquet", |
| 1540 | + file_format=FileFormat.PARQUET, |
| 1541 | + partition={}, |
| 1542 | + record_count=100, |
| 1543 | + file_size_in_bytes=1024, |
| 1544 | + lower_bounds={1: to_bytes(file_type, lower_bound)}, |
| 1545 | + upper_bounds={1: to_bytes(file_type, upper_bound)}, |
| 1546 | + null_value_counts={1: 0}, |
| 1547 | + nan_value_counts={1: 0}, |
| 1548 | + ) |
| 1549 | + |
| 1550 | + evaluator = _StrictMetricsEvaluator(schema, op("col", lit)) |
| 1551 | + assert evaluator.eval(data_file) == expected |
0 commit comments