Skip to content

Commit 25a267e

Browse files
committed
add test for avro sanitization
1 parent 4cac691 commit 25a267e

1 file changed

Lines changed: 30 additions & 0 deletions

File tree

tests/test_schema.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -563,6 +563,36 @@ def test_sanitize() -> None:
563563
assert sanitize_column_names(before_sanitized) == expected_schema
564564

565565

566+
def test_sanitize_special_chars() -> None:
    """Verify sanitize_column_names on field names that contain special characters.

    Every field is a required StringType, so the input and expected schemas
    differ only in their field names.
    """
    # Maps each raw field name to the name expected after sanitization:
    #   - a leading digit is prefixed with "_"
    #   - "." is expected to become "_x2E" and "#" to become "_x23"
    #   - a unicode character is expected to become "_x<hex>"
    #   - an already-valid name ("x_") is expected to stay unchanged
    # Insertion order matters: it determines the field ids (1..5).
    raw_to_sanitized = {
        "9x": "_9x",
        "x_": "x_",
        "a.b": "a_x2Eb",
        "☃": "_x2603",
        "a#b": "a_x23b",
    }

    unsanitized = Schema(
        *(
            NestedField(field_id=position, name=raw_name, field_type=StringType(), required=True)
            for position, raw_name in enumerate(raw_to_sanitized.keys(), start=1)
        ),
        schema_id=1,
        identifier_field_ids=[1],
    )

    wanted = Schema(
        *(
            NestedField(field_id=position, name=clean_name, field_type=StringType(), required=True)
            for position, clean_name in enumerate(raw_to_sanitized.values(), start=1)
        ),
        schema_id=1,
        identifier_field_ids=[1],
    )

    assert sanitize_column_names(unsanitized) == wanted
594+
595+
566596
def test_prune_columns_string(table_schema_nested_with_struct_key_map: Schema) -> None:
567597
assert prune_columns(table_schema_nested_with_struct_key_map, {1}, False) == Schema(
568598
NestedField(field_id=1, name="foo", field_type=StringType(), required=True), schema_id=1, identifier_field_ids=[1]

0 commit comments

Comments
 (0)