Skip to content

Commit 2573645

Browse files
committed
Create project
1 parent 2140afc commit 2573645

3 files changed

Lines changed: 224 additions & 0 deletions

File tree

pyproject.toml

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
[project]
name = "polars_access_mdbtools"
version = "0.0.1"
authors = [
  { name="DeflateAwesome" },
]
description = "A library for reading tables from an Access database into Polars dataframes, using mdbtools"
readme = "README.md"
# The module annotates its functions with built-in generics (`list[str]`,
# `dict[str, str]`). Those annotations are evaluated when the functions are
# defined, which fails with a TypeError before Python 3.9.
requires-python = ">=3.9"
classifiers = [
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
]
dependencies = [
    "polars",
]

[project.urls]
# NOTE(review): these look like the placeholder URLs from the Python packaging
# tutorial - replace them with this project's own repository URLs.
Homepage = "https://github.com/pypa/sampleproject"
Issues = "https://github.com/pypa/sampleproject/issues"

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
polars
Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
1+
import re
2+
import subprocess
3+
from typing import Union
4+
import os
5+
import warnings
6+
7+
import polars as pl
8+
9+
10+
# Matches one "CREATE TABLE [name] ( ... );" statement.
# Group 1 is the table name, group 2 is everything between the opening
# parenthesis and the ";" (including the closing parenthesis).
# A raw string is required here: "\[", "\s" and "\(" are not valid string
# escapes and trigger a warning in a regular string literal.
CREATE_TABLE_RE = re.compile(r"CREATE TABLE \[([^]]+)\]\s+\((.*?\));",
                             re.MULTILINE | re.DOTALL)

# Matches one column definition line such as "[Name]  Text (50),".
# `column_name` is the text inside the brackets, `data_type` is everything
# after it up to (not including) the first comma.
DATA_TYPE_DEF_RE = re.compile(r"^\s*\[(?P<column_name>[^\]]+)\]\s*(?P<data_type>[A-Za-z]+[^,]+),?")
14+
15+
def list_table_names(rdb_file: Union[str, os.PathLike], encoding: str = 'utf-8') -> list[str]:
    """
    Lists the names of the tables in a given database using 'mdb-tables'.

    :param rdb_file: The MS Access database file.
    :param encoding: The content encoding of the output of the mdb-tables command.
    :return: A list of the tables in a given database (empty if there are none).
    :raises subprocess.CalledProcessError: if 'mdb-tables' exits with a non-zero status.
    """
    tables = subprocess.check_output(['mdb-tables', "--single-column", rdb_file]).decode(encoding)
    # One name per line. Blank lines are skipped so that empty output gives []
    # instead of [''], and a '\r' left over from CRLF line endings is removed.
    return [name.rstrip("\r") for name in tables.strip().split("\n") if name.strip()]
25+
26+
27+
def _convert_data_type_from_access_to_polars(data_type: str) -> Union[pl.DataType, None]:
    """
    Translate an Access data type name into the matching Polars data type.

    The name is lower-cased and stripped, then compared by prefix, so text
    following the type name (for example a size) does not affect the result.

    :param data_type: The Access data type name.
    :return: The Polars data type, or None if the name is not recognised.
    """
    # Source: https://github.com/mdbtools/mdbtools/blob/0e77b68e76701ddc7aacb2c2e10ecdad1bb530ec/src/libmdb/backend.c#L27
    # Each rule is (prefix, attribute name on `pl`). Rules are tried in order
    # and the first matching prefix wins, so 'datetime' is tested before 'date'.
    # The attribute is only looked up on `pl` once a rule has matched.
    rules_before_integer_fallback = (
        ('boolean', 'Boolean'),
        ('byte', 'UInt8'),
        ('integer', 'Int32'),
        ('long integer', 'Int64'),
        ('currency', 'Decimal'),
        ('single', 'Float32'),
        ('double', 'Float64'),
        ('datetime', 'Datetime'),
        ('binary', 'Binary'),
        ('text', 'String'),
        ('ole', 'String'),  # maybe there's a better option
    )
    rules_after_integer_fallback = (
        ('memo', 'String'),  # 'memo/hyperlink'
        ('hyperlink', 'String'),  # Might not be real
        ('replication id', 'String'),
        ('date', 'Date'),  # Might not be real
    )

    normalized = data_type.lower().strip()

    for prefix, pl_attr in rules_before_integer_fallback:
        if normalized.startswith(prefix):
            return getattr(pl, pl_attr)

    if "integer" in normalized:
        # this shouldn't happen, as both 'integer' and 'long integer' are already handled
        return pl.Int32

    for prefix, pl_attr in rules_after_integer_fallback:
        if normalized.startswith(prefix):
            return getattr(pl, pl_attr)

    # Unknown type: the caller decides whether that is an error.
    return None
67+
68+
def _extract_data_type_definitions(defs_str: str) -> dict[str, str]:
    """
    Collect the column definitions found in the body of a CREATE TABLE statement.

    Every line is matched against `DATA_TYPE_DEF_RE`; lines that do not match
    are ignored. If a column name occurs more than once, the last one wins.

    :param defs_str: The text to scan, one column definition per line.
    :return: a dictionary of `{column_name: data_type}`
    """
    line_matches = (DATA_TYPE_DEF_RE.match(text_line) for text_line in defs_str.splitlines())
    return {
        found.group('column_name'): found.group('data_type')
        for found in line_matches
        if found
    }
78+
79+
def _read_table_mdb_schema(rdb_file: Union[str, os.PathLike], table_name: str, encoding: str = 'utf-8') -> dict[str, str]:
    """
    Run 'mdb-schema' for one table and return its column definitions.

    :param rdb_file: The MS Access database file.
    :param table_name: The name of the table whose schema is requested.
    :param encoding: The encoding used to decode the 'mdb-schema' output.
    :return: a dictionary of `{column_name: access_data_type}`
    :raises ValueError: if the output contains no CREATE TABLE statement, more than one, or one for a different table.
    """
    # TODO: could add these as arguments in case anyone ever wants to use them
    schema_options = [
        '--no-default-values',
        '--no-not_empty',
        '--no-comments',
        '--no-indexes',
        '--no-relations',
    ]
    raw_output = subprocess.check_output(['mdb-schema', *schema_options, '--table', table_name, rdb_file])

    # Drop empty lines and lines starting with '-' (presumably the '--' banner
    # lines that 'mdb-schema' prints around the DDL).
    kept_lines = [
        text_line
        for text_line in raw_output.decode(encoding).splitlines()
        if text_line and not text_line.startswith('-')
    ]
    schema_ddl = "\n".join(kept_lines)

    create_table_matches = CREATE_TABLE_RE.findall(schema_ddl)
    if not create_table_matches:
        raise ValueError(f"Table schema {table_name} not found in 'mdb-schema' output.")
    if len(create_table_matches) > 1:
        # TODO: could be a warning
        raise ValueError(f"Multiple table schemas found for {table_name} in 'mdb-schema' output.")

    # `table_name_mdb` must keep this name: the f-string below prints it.
    table_name_mdb, defs = create_table_matches[0]
    if table_name_mdb != table_name:
        raise ValueError(f"Table name mismatch from 'mdb-schema' response: table_name_arg={table_name}, {table_name_mdb=}")

    return _extract_data_type_definitions(defs)
114+
115+
116+
def _convert_mdb_schema_to_polars_schema(mdb_schema: dict[str, str], implicit_string: bool = True) -> dict[str, pl.DataType]:
    """
    Converts a table's schema from `_read_table_mdb_schema(...)` format to Polars schema format.

    :param mdb_schema: the output of `_read_table_mdb_schema(...)`, i.e. `{column_name: access_data_type}`
    :param implicit_string: If true, mark strings and unknown datatypes as `pl.String`. Otherwise, raise an error on unhandled SQL data types.
    :return: a dictionary of `{column_name: pl.DataType}`
    :raises ValueError: if a data type is not recognised and `implicit_string` is false.
    """
    pl_table_schema: dict[str, pl.DataType] = {}
    # `column` and `data_type` keep these names: the error message prints them.
    for column, data_type in mdb_schema.items():
        pl_data_type = _convert_data_type_from_access_to_polars(data_type)
        if pl_data_type is not None:
            pl_table_schema[column] = pl_data_type
        elif implicit_string:
            # Unknown type: fall back to reading the column as text.
            pl_table_schema[column] = pl.String
        else:
            raise ValueError(f"Unhandled data type: {column=}, {data_type=}")
    return pl_table_schema
135+
136+
137+
def read_table(rdb_file: Union[str, os.PathLike], table_name: str, data_encoding: str = 'utf-8', implicit_string: bool = True) -> pl.DataFrame:
    """
    Read one table of a MS Access database as a Polars DataFrame.

    The table schema is obtained with 'mdb-schema' and the rows with
    'mdb-export'; the exported CSV is parsed by `pl.read_csv`.

    :param rdb_file: The MS Access database file.
    :param table_name: The name of the table to process.
    :param data_encoding: The encoding passed to `pl.read_csv` for the 'mdb-export' output.
    :param implicit_string: If true, mark strings and unknown datatypes as `pl.String`. Otherwise, raise an error on unhandled SQL data types.
    :return: a `pl.DataFrame`
    :raises subprocess.CalledProcessError: if 'mdb-schema' or 'mdb-export' exits with a non-zero status.
    :raises ValueError: if the table schema cannot be determined, or a data type is unhandled and `implicit_string` is false.
    """
    schema_encoding = 'utf-8'
    mdb_schema = _read_table_mdb_schema(rdb_file, table_name, schema_encoding)
    pl_schema_target = _convert_mdb_schema_to_polars_schema(mdb_schema, implicit_string)

    # transform the schema to a format that Polars can read (pl_schema_target -> pl_schema_read)
    pl_schema_read: dict[str, pl.DataType] = {}
    boolean_col_names: list[str] = []
    binary_col_names: list[str] = []
    for col_name, col_type in pl_schema_target.items():
        if col_type == pl.Binary:
            # must read as string (hex), then convert to binary
            pl_schema_read[col_name] = pl.String
            binary_col_names.append(col_name)
        elif col_type == pl.Boolean:
            # must read as UInt8 (0, 1, NULL), then convert to pl.Boolean after
            pl_schema_read[col_name] = pl.UInt8
            boolean_col_names.append(col_name)
        else:
            pl_schema_read[col_name] = col_type

    cmd = ['mdb-export', '--bin=hex', '--date-format', '%Y-%m-%d', '--datetime-format', '%Y-%m-%dT%H:%M:%S', rdb_file, table_name]

    # The context manager closes the pipe and waits for the child process on
    # exit, also when `pl.read_csv` raises, so no process or file descriptor
    # is left behind.
    with subprocess.Popen(cmd, stdout=subprocess.PIPE) as proc:
        # silence this warning: UserWarning: Polars found a filename. Ensure you pass a path to the file instead of a python file object when possible for best performance
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", message="Polars found a filename.*")

            df = pl.read_csv(
                proc.stdout,
                schema=pl_schema_read,
                encoding=data_encoding,
            )

    # A failed export may have produced truncated output; report the failure
    # instead of returning partial data.
    if proc.returncode != 0:
        raise subprocess.CalledProcessError(proc.returncode, cmd)

    # convert binary columns
    df = df.with_columns([
        pl.col(col_name).str.decode('hex')
        for col_name in binary_col_names
    ])

    # convert boolean columns
    df = df.with_columns([
        (pl.col(col_name) > pl.lit(0)).cast(pl.Boolean).alias(col_name)
        for col_name in boolean_col_names
    ])

    return df

0 commit comments

Comments
 (0)