|
| 1 | +import re |
| 2 | +import subprocess |
| 3 | +from typing import Union |
| 4 | +import os |
| 5 | +import warnings |
| 6 | + |
| 7 | +import polars as pl |
| 8 | + |
| 9 | + |
# Matches one `CREATE TABLE [name] ( ... );` statement in 'mdb-schema' output.
# Group 1 is the table name; group 2 is the column definition text, including
# the closing parenthesis of the statement.
# A raw string is required: `\[`, `\s`, `\(` and `\)` are not valid string
# escapes and trigger a SyntaxWarning on recent Python versions otherwise.
CREATE_TABLE_RE = re.compile(
    r"CREATE TABLE \[([^]]+)\]\s+\((.*?\));",
    re.MULTILINE | re.DOTALL,
)

# Matches a single column definition line such as `[Name]  Text (50),`.
# `data_type` runs from the first letter after the column name up to (but not
# including) the next comma.
DATA_TYPE_DEF_RE = re.compile(r"^\s*\[(?P<column_name>[^\]]+)\]\s*(?P<data_type>[A-Za-z]+[^,]+),?")
| 14 | + |
def list_table_names(rdb_file: Union[str, os.PathLike], encoding: str = 'utf-8') -> list[str]:
    """
    Lists the names of the tables in a given database using 'mdb-tables'.

    :param rdb_file: The MS Access database file.
    :param encoding: The content encoding of the output of the mdb-tables command.
    :return: A list of the tables in a given database; empty if there are none.
    :raises subprocess.CalledProcessError: if 'mdb-tables' exits with a non-zero status.
    """
    output = subprocess.check_output(['mdb-tables', "--single-column", rdb_file]).decode(encoding)
    # splitlines() copes with '\r\n' line endings, and skipping blank lines
    # means a database without tables yields [] instead of [''].
    return [name for name in output.splitlines() if name.strip()]
| 25 | + |
| 26 | + |
def _convert_data_type_from_access_to_polars(data_type: str) -> Union[pl.DataType, None]:
    """
    Map an MS Access data type name to the matching Polars data type.

    Matching is case-insensitive and ignores surrounding whitespace. Most types
    are recognised by prefix, so any text following the type name is ignored.

    :param data_type: The Access data type name, as found in the 'mdb-schema' output.
    :return: The Polars data type, or None when the Access type is not recognised.
    """
    # Source: https://github.com/mdbtools/mdbtools/blob/0e77b68e76701ddc7aacb2c2e10ecdad1bb530ec/src/libmdb/backend.c#L27
    normalized = data_type.lower().strip()

    # Prefixes tried first, in order. 'datetime' is listed here so that it wins
    # over the shorter 'date' prefix further down.
    primary_prefixes = (
        ('boolean', pl.Boolean),
        ('byte', pl.UInt8),
        ('integer', pl.Int32),
        ('long integer', pl.Int64),
        ('currency', pl.Decimal),
        ('single', pl.Float32),
        ('double', pl.Float64),
        ('datetime', pl.Datetime),
        ('binary', pl.Binary),
        ('text', pl.String),
        ('ole', pl.String),  # maybe there's a better option
    )
    for prefix, polars_type in primary_prefixes:
        if normalized.startswith(prefix):
            return polars_type

    # this shouldn't happen, as both 'integer' and 'long integer' are already handled
    if "integer" in normalized:
        return pl.Int32

    # Prefixes tried only after the generic "integer" check above.
    secondary_prefixes = (
        ('memo', pl.String),  # 'memo/hyperlink'
        ('hyperlink', pl.String),  # Might not be real
        ('replication id', pl.String),
        ('date', pl.Date),  # Might not be real
    )
    for prefix, polars_type in secondary_prefixes:
        if normalized.startswith(prefix):
            return polars_type

    # Unknown type: the caller decides whether to fall back or to raise.
    return None
| 67 | + |
def _extract_data_type_definitions(defs_str: str) -> dict[str, str]:
    """
    Parse the column definitions of a `CREATE TABLE` statement.

    Each line of `defs_str` is matched against `DATA_TYPE_DEF_RE`; lines that do
    not match are skipped. If a column name occurs more than once, the last
    definition wins.

    :param defs_str: The text between the parentheses of a `CREATE TABLE` statement.
    :return: a dictionary of `{column_name: access_data_type}`
    """
    candidates = (DATA_TYPE_DEF_RE.match(row) for row in defs_str.splitlines())
    return {
        found.group('column_name'): found.group('data_type')
        for found in candidates
        if found
    }
| 78 | + |
def _read_table_mdb_schema(rdb_file: Union[str, os.PathLike], table_name: str, encoding: str = 'utf-8') -> dict[str, str]:
    """
    Run 'mdb-schema' for a single table and return its column types.

    :param rdb_file: The MS Access database file.
    :param table_name: The name of the table whose schema is requested.
    :param encoding: The schema encoding.
    :return: a dictionary of `{column_name: access_data_type}`
    :raises ValueError: if the output contains no `CREATE TABLE` statement, more than one,
        or one for a different table name.
    """
    raw_output = subprocess.check_output([
        'mdb-schema',
        '--no-default-values',  # TODO: could add these as arguments in case anyone ever wants to use them
        '--no-not_empty',
        '--no-comments',
        '--no-indexes',
        '--no-relations',
        '--table', table_name,
        rdb_file,
    ])

    # Keep only non-empty lines that do not start with '-'.
    kept_lines = [
        row
        for row in raw_output.decode(encoding).splitlines()
        if row and not row.startswith('-')
    ]
    schema_ddl = "\n".join(kept_lines)

    statements = CREATE_TABLE_RE.findall(schema_ddl)
    if not statements:
        raise ValueError(f"Table schema {table_name} not found in 'mdb-schema' output.")
    if len(statements) > 1:
        # TODO: could be a warning
        raise ValueError(f"Multiple table schemas found for {table_name} in 'mdb-schema' output.")

    table_name_mdb, column_defs = statements[0]
    if table_name_mdb != table_name:
        raise ValueError(f"Table name mismatch from 'mdb-schema' response: table_name_arg={table_name}, {table_name_mdb=}")

    return _extract_data_type_definitions(column_defs)
| 114 | + |
| 115 | + |
def _convert_mdb_schema_to_polars_schema(mdb_schema: dict[str, str], implicit_string: bool = True) -> dict[str, pl.DataType]:
    """
    Converts a table's schema from `_read_table_mdb_schema(...)` format to Polars schema format.

    :param mdb_schema: the output of `_read_table_mdb_schema(...)`, i.e. `{column_name: access_data_type}`
    :param implicit_string: If true, mark strings and unknown datatypes as `pl.String`. Otherwise, raise an error on unhandled SQL data types.
    :return: a dictionary of `{column_name: pl.DataType}`
    :raises ValueError: if a data type is not recognised and `implicit_string` is false.
    """
    pl_table_schema: dict[str, pl.DataType] = {}
    for column, data_type in mdb_schema.items():
        pl_data_type = _convert_data_type_from_access_to_polars(data_type)
        if pl_data_type is None:
            if not implicit_string:
                raise ValueError(f"Unhandled data type: {column=}, {data_type=}")
            # Unknown Access type: fall back to reading the column as text.
            pl_data_type = pl.String
        pl_table_schema[column] = pl_data_type
    return pl_table_schema
| 135 | + |
| 136 | + |
def read_table(rdb_file: Union[str, os.PathLike], table_name: str, data_encoding: str = 'utf-8', implicit_string: bool = True) -> pl.DataFrame:
    """
    Read one table of a MS Access database as a Polars DataFrame.

    The column types are taken from 'mdb-schema'; the rows are exported as CSV
    with 'mdb-export' and parsed by Polars.

    :param rdb_file: The MS Access database file.
    :param table_name: The name of the table to process.
    :param data_encoding: The encoding of the CSV data produced by 'mdb-export'.
    :param implicit_string: If true, mark strings and unknown datatypes as `pl.String`. Otherwise, raise an error on unhandled SQL data types.
    :return: a `pl.DataFrame`
    :raises subprocess.CalledProcessError: if 'mdb-schema' or 'mdb-export' exits with a non-zero status.
    :raises ValueError: if the table schema cannot be determined, or a data type is unhandled and `implicit_string` is false.
    """
    schema_encoding = 'utf-8'
    mdb_schema = _read_table_mdb_schema(rdb_file, table_name, schema_encoding)
    pl_schema_target = _convert_mdb_schema_to_polars_schema(mdb_schema, implicit_string)

    # transform the schema to a format that Polars can read (pl_schema_target -> pl_schema_read)
    pl_schema_read: dict[str, pl.DataType] = {}
    boolean_col_names: list[str] = []
    binary_col_names: list[str] = []
    for col_name, col_type in pl_schema_target.items():
        if col_type == pl.Binary:
            # must read as string (hex), then convert to binary
            pl_schema_read[col_name] = pl.String
            binary_col_names.append(col_name)
        elif col_type == pl.Boolean:
            # must read as UInt8 (0, 1, NULL), then convert to pl.Boolean after
            pl_schema_read[col_name] = pl.UInt8
            boolean_col_names.append(col_name)
        else:
            pl_schema_read[col_name] = col_type

    cmd = ['mdb-export', '--bin=hex', '--date-format', '%Y-%m-%d', '--datetime-format', '%Y-%m-%dT%H:%M:%S', rdb_file, table_name]

    # check_output waits for the process, closes its pipe and raises on a
    # non-zero exit status, so a failed export cannot be mistaken for an empty
    # or truncated table.
    csv_data = subprocess.check_output(cmd)

    df = pl.read_csv(
        csv_data,
        schema=pl_schema_read,
        encoding=data_encoding,
    )

    # convert binary columns (exported as hex strings because of '--bin=hex')
    if binary_col_names:
        df = df.with_columns([
            pl.col(col_name).str.decode('hex')
            for col_name in binary_col_names
        ])

    # convert boolean columns (read as UInt8; nulls stay null)
    if boolean_col_names:
        df = df.with_columns([
            (pl.col(col_name) > pl.lit(0)).cast(pl.Boolean).alias(col_name)
            for col_name in boolean_col_names
        ])

    return df
0 commit comments