#!/usr/bin/env python3
"""
Large File Splitter for Spreadsheet Applications
This script takes a large CSV or Excel file and splits it into multiple smaller files
that are compatible with spreadsheet applications like Google Sheets, Excel, or LibreOffice.
Arguments and Options:
Positional Arguments:
input_file Path to the input CSV or Excel file to be split.
Optional Arguments:
--output-dir Directory to save the generated output files.
(Default: Auto-generates a folder named '<input_file>_split_<timestamp>')
--target-app The spreadsheet software you intend to open the files with.
Automatically adjusts default row and size limits.
Choices: 'excel', 'google_sheets', 'libreoffice' (Default: 'excel')
--output-format Forces the output format of the split files.
Choices: 'csv', 'excel' (Default: Auto-detected based on input file)
--max-rows Overrides the target app limits to set a specific maximum number
of rows per output file. (e.g., --max-rows 10000)
--max-size-mb Overrides the target app limits to set a specific maximum file
size in megabytes per output file.
    --chunk-size      Number of rows read into memory at a time. Lower this
                      if you run into RAM constraints.
                      (Default: 10000)
Usage Examples:
Split a CSV for Google Sheets:
$ python file-splitter.py data.csv --target-app google_sheets
Split an Excel file into exactly 10,000 rows per file, outputting as CSV:
$ python file-splitter.py massive_data.xlsx --max-rows 10000 --output-format csv
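Programmatic usage (illustrative sketch; assumes the module is renamed to
file_splitter.py, since the hyphen makes 'file-splitter' unimportable):
    from file_splitter import FileSplitter
    splitter = FileSplitter('data.csv', target_app='google_sheets')
    splitter.split_file()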
"""
import pandas as pd
import os
import argparse
import time
import sys
from datetime import datetime
from pathlib import Path
from tqdm import tqdm
import math
import csv
# Optional column type conversions — leave empty if not needed
DEFAULT_COLUMN_TYPE_MAP = {
"PRICE": float,
"PRICE_CATEGORY": int,
"ROW": str,
"SEAT": str,
}
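# Example (hypothetical column name): extend the map for your own data, e.g.
# DEFAULT_COLUMN_TYPE_MAP["QUANTITY"] = int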
class FileSplitter:
"""
Splits large CSV and Excel files into smaller formats based on spreadsheet constraints.
"""
LIMITS = {
'excel': {'rows': 1048576, 'size_mb': 20},
'google_sheets': {'rows': 10000000, 'size_mb': 15},
'libreoffice': {'rows': 1048576, 'size_mb': 20}
}
def __init__(self, input_file, output_dir=None, target_app='excel', output_format=None,
max_rows=None, max_size_mb=None, chunk_size=10000, column_type_map=None):
self.input_file = input_file
self.chunk_size = chunk_size
self.input_ext = os.path.splitext(input_file)[1].lower()
# Setup output directory
if output_dir is None:
base_name = os.path.splitext(os.path.basename(input_file))[0]
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
output_dir = f"{base_name}_split_{timestamp}"
self.output_dir = output_dir
Path(self.output_dir).mkdir(parents=True, exist_ok=True)
# Configure application limits
self.target_app = target_app
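        # Default to a practical row cap well below the hard limit
        # (200k rows for desktop apps, 50k for Google Sheets imports).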
if max_rows is None:
self.max_rows = min(self.LIMITS[target_app]['rows'],
200000 if target_app in ['excel', 'libreoffice'] else 50000)
else:
self.max_rows = max_rows
self.max_size_mb = max_size_mb if max_size_mb is not None else self.LIMITS[target_app]['size_mb']
self.output_format = output_format.lower() if output_format else ('excel' if self.input_ext in ['.xlsx', '.xls'] else 'csv')
self.output_ext = '.xlsx' if self.output_format == 'excel' else '.csv'
# Type mapping initialization
self.column_type_map = column_type_map if column_type_map is not None else {}
# Runtime variables
self.headers = None
self.start_time = None
self.total_rows = 0
self.processed_rows = 0
self.num_files_created = 0
self.avg_row_size_bytes = 0
def load_file_info(self):
"""Analyzes the input file to determine row count, size, and estimated splits."""
print(f"Analyzing file: {self.input_file}")
self.start_time = time.time()
try:
file_size_bytes = os.path.getsize(self.input_file)
file_size_mb = file_size_bytes / (1024 * 1024)
print(f"File size: {file_size_mb:.2f} MB")
            if self.input_ext in ['.csv']:
                with open(self.input_file, 'r', encoding='utf-8', errors='replace') as f:
                    # Total lines minus the header line gives the data row count.
                    self.total_rows = max(sum(1 for _ in f) - 1, 0)
                if self.total_rows > 0:
                    self.avg_row_size_bytes = file_size_bytes / self.total_rows
            elif self.input_ext in ['.xlsx', '.xls']:
                # Excel offers no cheap row count, so read the first sheet once.
                df_info = pd.read_excel(self.input_file)
                self.headers = list(df_info.columns)
                self.total_rows = len(df_info)
                if self.total_rows > 0:
                    self.avg_row_size_bytes = file_size_bytes / self.total_rows
                del df_info
else:
print(f"Unsupported file format: {self.input_ext}")
return False
print(f"Total rows: {self.total_rows:,}")
print(f"Estimated average row size: {self.avg_row_size_bytes:.2f} bytes")
max_size_bytes = self.max_size_mb * 1024 * 1024
rows_per_file_by_size = int(max_size_bytes / self.avg_row_size_bytes) if self.avg_row_size_bytes > 0 else self.max_rows
rows_per_file = min(self.max_rows, rows_per_file_by_size)
estimated_num_files = math.ceil(self.total_rows / rows_per_file)
print(f"Will split into approximately {estimated_num_files} files with up to {rows_per_file:,} rows each")
return True
except Exception as e:
print(f"Error analyzing file: {e}")
return False
def get_output_filename(self, file_index):
"""Generates the sequential filename for output chunks."""
base_name = os.path.splitext(os.path.basename(self.input_file))[0]
return os.path.join(self.output_dir, f"{base_name}_part{file_index + 1:03d}{self.output_ext}")
def split_file(self):
"""Main generator loop that chunks the data and triggers file writes."""
print("\n" + "="*50)
print("STARTING FILE SPLITTING")
print("="*50)
if not self.load_file_info():
return False
split_start_time = time.time()
try:
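            # Recompute the per-file row budget (same math as load_file_info).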
max_size_bytes = self.max_size_mb * 1024 * 1024
rows_per_file_by_size = int(max_size_bytes / self.avg_row_size_bytes) if self.avg_row_size_bytes > 0 else self.max_rows
rows_per_file = min(self.max_rows, rows_per_file_by_size)
if rows_per_file <= 0:
rows_per_file = self.chunk_size
print(f"Using {rows_per_file:,} rows per output file")
reader = None
current_buffer = []
current_output_rows = 0
file_index = 0
# File reading initialization
if self.input_ext in ['.csv']:
reader = pd.read_csv(
self.input_file,
chunksize=self.chunk_size,
on_bad_lines='warn',
engine='python',
quoting=csv.QUOTE_MINIMAL,
quotechar='"',
dtype=str
)
elif self.input_ext in ['.xlsx', '.xls']:
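                # pandas cannot stream .xlsx files the way read_csv streams CSV,
                # so load the sheet once and slice it into chunk-sized views.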
full_data = pd.read_excel(self.input_file)
self.headers = list(full_data.columns)
def excel_chunk_generator(data, chunk_size):
for i in range(0, len(data), chunk_size):
yield data.iloc[i:i + chunk_size]
reader = excel_chunk_generator(full_data, self.chunk_size)
del full_data
else:
return False
with tqdm(total=self.total_rows, desc="Processing rows", unit="row") as pbar:
for chunk in reader:
chunk_len = len(chunk)
self.processed_rows += chunk_len
if self.headers is None and self.input_ext in ['.csv']:
self.headers = list(chunk.columns)
remaining_chunk = chunk
# Slice DataFrames instead of using iterrows() for massive performance gains
while not remaining_chunk.empty:
space_left = rows_per_file - current_output_rows
if len(remaining_chunk) < space_left:
# The whole chunk fits into the current file output
current_buffer.append(remaining_chunk)
current_output_rows += len(remaining_chunk)
remaining_chunk = pd.DataFrame() # Clear to exit inner loop
else:
# Slice the chunk exactly to fill the current file target
part_to_add = remaining_chunk.iloc[:space_left]
current_buffer.append(part_to_add)
# Combine buffered chunks and write
combined_df = pd.concat(current_buffer, ignore_index=True)
self._write_output_file(combined_df, file_index)
# Reset states for the next file
file_index += 1
self.num_files_created += 1
current_buffer = []
current_output_rows = 0
# Retain the unwritten portion of the chunk for the next iteration
remaining_chunk = remaining_chunk.iloc[space_left:]
# Progress bar and terminal outputs
pbar.update(chunk_len)
if self.processed_rows % 50000 == 0 or self.processed_rows == self.total_rows:
elapsed = time.time() - self.start_time
rows_per_sec = self.processed_rows / elapsed if elapsed > 0 else 0
print(f"\nProcessed {self.processed_rows:,}/{self.total_rows:,} rows "
f"({self.processed_rows/self.total_rows*100:.1f}%) "
f"at {rows_per_sec:.1f} rows/sec")
print(f"Created {self.num_files_created} files so far")
# Write out any leftover data that didn't reach the 'rows_per_file' threshold
if current_buffer:
combined_df = pd.concat(current_buffer, ignore_index=True)
self._write_output_file(combined_df, file_index)
self.num_files_created += 1
split_time = time.time() - split_start_time
total_time = time.time() - self.start_time
print("\n" + "="*50)
print("FILE SPLITTING COMPLETED")
print("="*50)
print(f"Total processing time: {total_time:.2f} seconds")
print(f"Splitting time: {split_time:.2f} seconds")
print(f"Created {self.num_files_created} files in {self.output_dir}")
return True
except KeyboardInterrupt:
print("\nProcess interrupted by user.")
return False
except Exception as e:
print(f"\nError during file splitting: {e}")
return False
def _write_output_file(self, df, file_index):
"""Applies data types and exports the final dataframe chunk to disk."""
output_file = self.get_output_filename(file_index)
try:
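            # Note: Excel output requires an engine such as openpyxl to be installed.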
df = self._apply_column_type_map(df)
if self.output_format == 'csv':
df.to_csv(output_file, index=False)
else:
df.to_excel(output_file, index=False)
file_size_mb = os.path.getsize(output_file) / (1024 * 1024)
print(f"Created file {file_index + 1}: {output_file} ({len(df):,} rows, {file_size_mb:.2f} MB)")
return True
except Exception as e:
print(f"Error writing output file {output_file}: {e}")
return False
def _apply_column_type_map(self, df):
"""Converts specific column types based on the initialized configuration mapping."""
if not self.column_type_map:
return df
for col, dtype in self.column_type_map.items():
if col in df.columns:
try:
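                    # CSV input is read with dtype=str, so cast through float
                    # first to handle values like "3.0" when targeting int.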
if dtype == int:
df[col] = df[col].apply(lambda x: int(float(x)) if str(x).strip() not in ['', 'nan', 'None'] else None)
elif dtype == float:
df[col] = df[col].apply(lambda x: float(x) if str(x).strip() not in ['', 'nan', 'None'] else None)
else:
df[col] = df[col].apply(lambda x: dtype(x) if pd.notnull(x) and str(x).strip() != '' else None)
except Exception as e:
print(f"⚠️ Warning: Could not convert column '{col}' to {dtype.__name__}: {e}")
return df
def main():
parser = argparse.ArgumentParser(description='Split large CSV/Excel files into smaller files for spreadsheet applications.')
parser.add_argument('input_file', help='Path to the input CSV or Excel file')
parser.add_argument('--output-dir', help='Directory to save output files')
parser.add_argument('--target-app', choices=['excel', 'google_sheets', 'libreoffice'], default='excel')
parser.add_argument('--output-format', choices=['csv', 'excel'])
parser.add_argument('--max-rows', type=int)
parser.add_argument('--max-size-mb', type=int)
parser.add_argument('--chunk-size', type=int, default=10000)
args = parser.parse_args()
print(f"Large File Splitter for Spreadsheet Applications")
print(f"Input file: {args.input_file}")
print(f"Output directory: {args.output_dir or 'Auto-generated'}")
print(f"Target application: {args.target_app}")
print(f"Output format: {args.output_format or 'Auto-detected'}")
print(f"Maximum rows per file: {args.max_rows or 'Default for ' + args.target_app}")
print(f"Maximum file size (MB): {args.max_size_mb or 'Default for ' + args.target_app}")
print(f"Processing chunk size: {args.chunk_size}")
# Initialize the class and pass in the default mapping
splitter = FileSplitter(
input_file=args.input_file,
output_dir=args.output_dir,
target_app=args.target_app,
output_format=args.output_format,
max_rows=args.max_rows,
max_size_mb=args.max_size_mb,
chunk_size=args.chunk_size,
column_type_map=DEFAULT_COLUMN_TYPE_MAP
)
try:
success = splitter.split_file()
if success:
print("\nFile splitting completed successfully.")
sys.exit(0)
else:
print("\nFile splitting failed.")
sys.exit(1)
except KeyboardInterrupt:
print("\nProcess interrupted by user.")
sys.exit(1)
except Exception as e:
print(f"\nUnexpected error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()