-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcsv-column-fixer.py
More file actions
61 lines (48 loc) · 2.09 KB
/
csv-column-fixer.py
File metadata and controls
61 lines (48 loc) · 2.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import csv
import re
from tqdm import tqdm
# Source CSV export to repair. NOTE: absolute, machine-specific path.
input_file = "/home/steftzor/Downloads/global-customer-export/global-customer-export-20250925.csv"
# Destination file that receives the repaired rows (overwritten on each run).
output_file = "/home/steftzor/Downloads/global-customer-export/global-customer-export-20250925_repaired_fast.csv"
# Text log that receives every row rejected for having too many columns.
log_file = "/home/steftzor/Downloads/global-customer-export/global-customer-export-20250925_repair_log_fast.txt"
# Column count every output row must have: shorter rows are padded with
# empty cells up to this width, longer rows are logged and skipped.
expected_cols = 83
# Precompiled once at import time because clean_cell runs for every cell.
# Despite the name, the pattern matches any run of THREE OR MORE consecutive
# double quotes (one quote, one-or-more quotes, one quote), not exactly three.
triple_quote_re = re.compile(r'""+"')


def clean_cell(cell):
    """Normalise a single CSV cell value.

    For ``str`` input, every run of three or more consecutive double quotes
    is collapsed to one double quote, then leading and trailing whitespace
    is stripped. Runs of one or two quotes are left untouched.

    Args:
        cell: The cell value; normally a ``str``.

    Returns:
        The cleaned string, or ``cell`` itself, unchanged, when it is not
        a ``str``.
    """
    if isinstance(cell, str):
        return triple_quote_re.sub('"', cell).strip()
    return cell
def parse_record(record):
    """Split one logical CSV record into its raw cell values.

    The record is parsed with ``csv.reader`` so that commas and newlines
    inside quoted fields stay part of the field, and the enclosing quotes
    are removed. A plain ``record.split(',')`` would cut quoted fields at
    every comma and leave the quote characters in the cells, which
    ``csv.writer`` would then escape a second time on output.

    Args:
        record: One complete logical row, possibly containing embedded
            newlines, without its trailing line terminator.

    Returns:
        The list of cell strings. An empty record yields an empty list.
        If the csv parser rejects the record (``csv.Error``), the result of
        a naive ``record.split(',')`` is returned as a best-effort fallback.
    """
    try:
        return next(csv.reader([record]), [])
    except csv.Error:
        # Malformed beyond what the parser tolerates (e.g. a newline in an
        # unquoted position): keep going with the crude split rather than
        # aborting the whole repair run.
        return record.split(',')


def repair_rows(lines, writer, logfile, expected_cols):
    """Reassemble, clean and normalise CSV rows from an iterable of lines.

    Physical lines are accumulated until the accumulated text contains an
    even number of double quotes, which is taken to mean that no quoted
    field is still open. Each completed record is parsed, every cell is
    passed through ``clean_cell``, and then:

    * rows with fewer than ``expected_cols`` cells are padded with empty
      strings, written, and counted as repaired;
    * rows with more than ``expected_cols`` cells are written to
      ``logfile`` and counted as skipped;
    * rows with exactly ``expected_cols`` cells are written as they are.

    A record still open when the input ends is written to ``logfile`` and
    counted as skipped instead of being dropped without a trace.

    Args:
        lines: Iterable of text lines (for example an open text file).
        writer: Object with a ``writerow(list)`` method, such as the result
            of ``csv.writer``.
        logfile: Writable text stream for rejected rows.
        expected_cols: Required number of columns per output row.

    Returns:
        A ``(row_num, repaired_rows, skipped_rows)`` tuple, where
        ``row_num`` is the number of complete records seen.
    """
    row_num = 0
    repaired_rows = 0
    skipped_rows = 0

    pending = []      # physical lines of the record currently being built
    quote_count = 0   # double quotes seen so far in the pending record

    for line in lines:
        text = line.rstrip('\n')
        pending.append(text)
        # Count quotes incrementally instead of rescanning the whole
        # accumulated record for every physical line.
        quote_count += text.count('"')
        if quote_count % 2:
            # Odd number of quotes: a quoted field is still open, so the
            # next physical line belongs to the same record.
            continue

        # Re-insert the newlines that separated the physical lines.
        record = "\n".join(pending)
        pending.clear()
        quote_count = 0
        row_num += 1

        row = [clean_cell(cell) for cell in parse_record(record)]

        # Pad short rows
        if len(row) < expected_cols:
            row += [''] * (expected_cols - len(row))
            repaired_rows += 1
        # Log long rows
        elif len(row) > expected_cols:
            logfile.write(f"Row {row_num} has {len(row)} columns: {row}\n")
            skipped_rows += 1
            continue

        writer.writerow(row)

    if pending:
        # Input ended inside an unterminated quoted field.
        record = "\n".join(pending)
        logfile.write(
            "Unterminated record at end of input (unbalanced quotes), "
            f"dropped: {record!r}\n"
        )
        skipped_rows += 1

    return row_num, repaired_rows, skipped_rows


def main():
    """Repair ``input_file`` into ``output_file`` and print a summary."""
    print("🚀 Starting HIGH-SPEED CSV repair...")

    # errors='ignore' silently drops bytes that are not valid UTF-8.
    with open(input_file, 'r', encoding='utf-8', errors='ignore') as infile, \
            open(output_file, 'w', newline='', encoding='utf-8') as outfile, \
            open(log_file, 'w', encoding='utf-8') as logfile:
        writer = csv.writer(outfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        # tqdm without total for live progress
        progress = tqdm(infile, desc="Processing CSV", unit="lines", dynamic_ncols=True)
        row_num, repaired_rows, skipped_rows = repair_rows(
            progress, writer, logfile, expected_cols
        )

    print("✅ HIGH-SPEED CSV repair complete!")
    print(f"Total rows processed: {row_num}")
    print(f"Repaired rows: {repaired_rows}, Skipped rows: {skipped_rows}")
    print(f"Output file: {output_file}")
    print(f"Log file: {log_file}")


if __name__ == "__main__":
    main()