#!/usr/bin/env python3
"""
Large File Splitter for Spreadsheet Applications
This script takes a large CSV or Excel file and splits it into multiple smaller files
that are compatible with spreadsheet applications like Google Sheets, Excel, or LibreOffice.
Arguments and Options:
Positional Arguments:
input_file Path to the input CSV or Excel file to be split.
Optional Arguments:
--output-dir Directory to save the generated output files.
(Default: Auto-generates a folder named '<input_file>_split_<timestamp>')
--target-app The spreadsheet software you intend to open the files with.
Automatically adjusts default row and size limits.
Choices: 'excel', 'google_sheets', 'libreoffice' (Default: 'excel')
--output-format Forces the output format of the split files.
Choices: 'csv', 'excel' (Default: Auto-detected based on input file)
--max-rows Overrides the target app limits to set a specific maximum number
of rows per output file. (e.g., --max-rows 10000)
--max-size-mb Overrides the target app limits to set a specific maximum file
size in megabytes per output file.
    --chunk-size      Number of rows read into memory at a time. Lower this
                      if you run into RAM constraints.
                      (Default: 10000)
Usage Examples:
Split a CSV for Google Sheets:
$ python file-splitter.py data.csv --target-app google_sheets
Split an Excel file into exactly 10,000 rows per file, outputting as CSV:
$ python file-splitter.py massive_data.xlsx --max-rows 10000 --output-format csv
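Programmatic usage (illustrative sketch; assumes the module is renamed to
file_splitter.py, since the hyphen makes 'file-splitter' unimportable):
    from file_splitter import FileSplitter
    splitter = FileSplitter('data.csv', target_app='google_sheets')
    splitter.split_file()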
"""
import pandas as pd
import os
import argparse
import time
import sys
from datetime import datetime
from pathlib import Path
from tqdm import tqdm
import math
import csv
# Optional column type conversions — leave empty if not needed
DEFAULT_COLUMN_TYPE_MAP = {
"PRICE": float,
"PRICE_CATEGORY": int,
"ROW": str,
"SEAT": str,
}
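# Example (hypothetical column name): extend the map for your own data, e.g.
# DEFAULT_COLUMN_TYPE_MAP["QUANTITY"] = int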
class FileSplitter:
"""
Splits large CSV and Excel files into smaller formats based on spreadsheet constraints.
"""
LIMITS = {
'excel': {'rows': 1048576, 'size_mb': 20},
'google_sheets': {'rows': 10000000, 'size_mb': 15},
'libreoffice': {'rows': 1048576, 'size_mb': 20}
}
def __init__(self, input_file, output_dir=None, target_app='excel', output_format=None,
max_rows=None, max_size_mb=None, chunk_size=10000, column_type_map=None):
self.input_file = input_file
self.chunk_size = chunk_size
self.input_ext = os.path.splitext(input_file)[1].lower()
# Setup output directory
if output_dir is None:
base_name = os.path.splitext(os.path.basename(input_file))[0]
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
output_dir = f"{base_name}_split_{timestamp}"
self.output_dir = output_dir
Path(self.output_dir).mkdir(parents=True, exist_ok=True)
# Configure application limits
self.target_app = target_app
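        # Default to a practical row cap well below the hard limit
        # (200k rows for desktop apps, 50k for Google Sheets imports).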
if max_rows is None:
self.max_rows = min(self.LIMITS[target_app]['rows'],
200000 if target_app in ['excel', 'libreoffice'] else 50000)
else:
self.max_rows = max_rows
self.max_size_mb = max_size_mb if max_size_mb is not None else self.LIMITS[target_app]['size_mb']
self.output_format = output_format.lower() if output_format else ('excel' if self.input_ext in ['.xlsx', '.xls'] else 'csv')
self.output_ext = '.xlsx' if self.output_format == 'excel' else '.csv'
# Type mapping initialization
self.column_type_map = column_type_map if column_type_map is not None else {}
# Runtime variables
self.headers = None
self.start_time = None
self.total_rows = 0
self.processed_rows = 0
self.num_files_created = 0
self.avg_row_size_bytes = 0
def load_file_info(self):
"""Analyzes the input file to determine row count, size, and estimated splits."""
print(f"Analyzing file: {self.input_file}")
self.start_time = time.time()
try:
file_size_bytes = os.path.getsize(self.input_file)
file_size_mb = file_size_bytes / (1024 * 1024)
print(f"File size: {file_size_mb:.2f} MB")
            if self.input_ext in ['.csv']:
                with open(self.input_file, 'r', encoding='utf-8', errors='replace') as f:
                    # Total lines minus the header line gives the data row count.
                    self.total_rows = max(sum(1 for _ in f) - 1, 0)
                if self.total_rows > 0:
                    self.avg_row_size_bytes = file_size_bytes / self.total_rows
            elif self.input_ext in ['.xlsx', '.xls']:
                # Excel offers no cheap row count, so read the first sheet once.
                df_info = pd.read_excel(self.input_file)
                self.headers = list(df_info.columns)
                self.total_rows = len(df_info)
                if self.total_rows > 0:
                    self.avg_row_size_bytes = file_size_bytes / self.total_rows
                del df_info
else:
print(f"Unsupported file format: {self.input_ext}")
return False
print(f"Total rows: {self.total_rows:,}")
print(f"Estimated average row size: {self.avg_row_size_bytes:.2f} bytes")
max_size_bytes = self.max_size_mb * 1024 * 1024
rows_per_file_by_size = int(max_size_bytes / self.avg_row_size_bytes) if self.avg_row_size_bytes > 0 else self.max_rows
rows_per_file = min(self.max_rows, rows_per_file_by_size)
estimated_num_files = math.ceil(self.total_rows / rows_per_file)
print(f"Will split into approximately {estimated_num_files} files with up to {rows_per_file:,} rows each")
return True
except Exception as e:
print(f"Error analyzing file: {e}")
return False
def get_output_filename(self, file_index):
"""Generates the sequential filename for output chunks."""
base_name = os.path.splitext(os.path.basename(self.input_file))[0]
return os.path.join(self.output_dir, f"{base_name}_part{file_index + 1:03d}{self.output_ext}")
def split_file(self):
"""Main generator loop that chunks the data and triggers file writes."""
print("\n" + "="*50)
print("STARTING FILE SPLITTING")
print("="*50)
if not self.load_file_info():
return False
split_start_time = time.time()
try:
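            # Recompute the per-file row budget (same math as load_file_info).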
max_size_bytes = self.max_size_mb * 1024 * 1024
rows_per_file_by_size = int(max_size_bytes / self.avg_row_size_bytes) if self.avg_row_size_bytes > 0 else self.max_rows
rows_per_file = min(self.max_rows, rows_per_file_by_size)
if rows_per_file <= 0:
rows_per_file = self.chunk_size
print(f"Using {rows_per_file:,} rows per output file")
reader = None
current_buffer = []
current_output_rows = 0
file_index = 0
# File reading initialization
if self.input_ext in ['.csv']:
reader = pd.read_csv(
self.input_file,
chunksize=self.chunk_size,
on_bad_lines='warn',
engine='python',
quoting=csv.QUOTE_MINIMAL,
quotechar='"',
dtype=str
)
elif self.input_ext in ['.xlsx', '.xls']:
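                # pandas cannot stream .xlsx files the way read_csv streams CSV,
                # so load the sheet once and slice it into chunk-sized views.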
full_data = pd.read_excel(self.input_file)
self.headers = list(full_data.columns)
def excel_chunk_generator(data, chunk_size):
for i in range(0, len(data), chunk_size):
yield data.iloc[i:i + chunk_size]
reader = excel_chunk_generator(full_data, self.chunk_size)
del full_data
else:
return False
with tqdm(total=self.total_rows, desc="Processing rows", unit="row") as pbar:
for chunk in reader:
chunk_len = len(chunk)
self.processed_rows += chunk_len
if self.headers is None and self.input_ext in ['.csv']:
self.headers = list(chunk.columns)
remaining_chunk = chunk
# Slice DataFrames instead of using iterrows() for massive performance gains
while not remaining_chunk.empty:
space_left = rows_per_file - current_output_rows
if len(remaining_chunk) < space_left:
# The whole chunk fits into the current file output
current_buffer.append(remaining_chunk)
current_output_rows += len(remaining_chunk)
remaining_chunk = pd.DataFrame() # Clear to exit inner loop
else:
# Slice the chunk exactly to fill the current file target
part_to_add = remaining_chunk.iloc[:space_left]
current_buffer.append(part_to_add)
# Combine buffered chunks and write
combined_df = pd.concat(current_buffer, ignore_index=True)
self._write_output_file(combined_df, file_index)
# Reset states for the next file
file_index += 1
self.num_files_created += 1
current_buffer = []
current_output_rows = 0
# Retain the unwritten portion of the chunk for the next iteration
remaining_chunk = remaining_chunk.iloc[space_left:]
# Progress bar and terminal outputs
pbar.update(chunk_len)
if self.processed_rows % 50000 == 0 or self.processed_rows == self.total_rows:
elapsed = time.time() - self.start_time
rows_per_sec = self.processed_rows / elapsed if elapsed > 0 else 0
print(f"\nProcessed {self.processed_rows:,}/{self.total_rows:,} rows "
f"({self.processed_rows/self.total_rows*100:.1f}%) "
f"at {rows_per_sec:.1f} rows/sec")
print(f"Created {self.num_files_created} files so far")
# Write out any leftover data that didn't reach the 'rows_per_file' threshold
if current_buffer:
combined_df = pd.concat(current_buffer, ignore_index=True)
self._write_output_file(combined_df, file_index)
self.num_files_created += 1
split_time = time.time() - split_start_time
total_time = time.time() - self.start_time
print("\n" + "="*50)
print("FILE SPLITTING COMPLETED")
print("="*50)
print(f"Total processing time: {total_time:.2f} seconds")
print(f"Splitting time: {split_time:.2f} seconds")
print(f"Created {self.num_files_created} files in {self.output_dir}")
return True
except KeyboardInterrupt:
print("\nProcess interrupted by user.")
return False
except Exception as e:
print(f"\nError during file splitting: {e}")
return False
def _write_output_file(self, df, file_index):
"""Applies data types and exports the final dataframe chunk to disk."""
output_file = self.get_output_filename(file_index)
try:
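            # Note: Excel output requires an engine such as openpyxl to be installed.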
df = self._apply_column_type_map(df)
if self.output_format == 'csv':
df.to_csv(output_file, index=False)
else:
df.to_excel(output_file, index=False)
file_size_mb = os.path.getsize(output_file) / (1024 * 1024)
print(f"Created file {file_index + 1}: {output_file} ({len(df):,} rows, {file_size_mb:.2f} MB)")
return True
except Exception as e:
print(f"Error writing output file {output_file}: {e}")
return False
def _apply_column_type_map(self, df):
"""Converts specific column types based on the initialized configuration mapping."""
if not self.column_type_map:
return df
for col, dtype in self.column_type_map.items():
if col in df.columns:
try:
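                    # CSV input is read with dtype=str, so cast through float
                    # first to handle values like "3.0" when targeting int.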
if dtype == int:
df[col] = df[col].apply(lambda x: int(float(x)) if str(x).strip() not in ['', 'nan', 'None'] else None)
elif dtype == float:
df[col] = df[col].apply(lambda x: float(x) if str(x).strip() not in ['', 'nan', 'None'] else None)
else:
df[col] = df[col].apply(lambda x: dtype(x) if pd.notnull(x) and str(x).strip() != '' else None)
except Exception as e:
print(f"⚠️ Warning: Could not convert column '{col}' to {dtype.__name__}: {e}")
return df
def main():
parser = argparse.ArgumentParser(description='Split large CSV/Excel files into smaller files for spreadsheet applications.')
parser.add_argument('input_file', help='Path to the input CSV or Excel file')
parser.add_argument('--output-dir', help='Directory to save output files')
parser.add_argument('--target-app', choices=['excel', 'google_sheets', 'libreoffice'], default='excel')
parser.add_argument('--output-format', choices=['csv', 'excel'])
parser.add_argument('--max-rows', type=int)
parser.add_argument('--max-size-mb', type=int)
parser.add_argument('--chunk-size', type=int, default=10000)
args = parser.parse_args()
print(f"Large File Splitter for Spreadsheet Applications")
print(f"Input file: {args.input_file}")
print(f"Output directory: {args.output_dir or 'Auto-generated'}")
print(f"Target application: {args.target_app}")
print(f"Output format: {args.output_format or 'Auto-detected'}")
print(f"Maximum rows per file: {args.max_rows or 'Default for ' + args.target_app}")
print(f"Maximum file size (MB): {args.max_size_mb or 'Default for ' + args.target_app}")
print(f"Processing chunk size: {args.chunk_size}")
# Initialize the class and pass in the default mapping
splitter = FileSplitter(
input_file=args.input_file,
output_dir=args.output_dir,
target_app=args.target_app,
output_format=args.output_format,
max_rows=args.max_rows,
max_size_mb=args.max_size_mb,
chunk_size=args.chunk_size,
column_type_map=DEFAULT_COLUMN_TYPE_MAP
)
try:
success = splitter.split_file()
if success:
print("\nFile splitting completed successfully.")
sys.exit(0)
else:
print("\nFile splitting failed.")
sys.exit(1)
except KeyboardInterrupt:
print("\nProcess interrupted by user.")
sys.exit(1)
except Exception as e:
print(f"\nUnexpected error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()