-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcsv-excel-processor_events_output_check_duplicate_hard_id.py
More file actions
51 lines (41 loc) · 1.52 KB
/
csv-excel-processor_events_output_check_duplicate_hard_id.py
File metadata and controls
51 lines (41 loc) · 1.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#!/usr/bin/env python3
"""
Check for duplicate TICKET_IDs across all event_output files (chunked)
This version handles large files efficiently using chunks and shows progress bars.
"""
import pandas as pd
import os
from tqdm import tqdm
# Directory scanned for "*_event_output.csv" files — site-specific path; adjust per machine.
INPUT_FOLDER = "/home/steftzor/Downloads/output_csv-files"
# Rows read per pandas chunk; bounds peak memory when files are large.
CHUNK_SIZE = 10000 # rows per chunk
def main(input_folder=INPUT_FOLDER, chunk_size=CHUNK_SIZE):
    """Report TICKET_IDs that occur more than once across all event output files.

    Scans every ``*_event_output.csv`` file in *input_folder*, streaming each
    one in chunks of *chunk_size* rows so memory stays bounded, and prints any
    TICKET_ID seen at least twice (within one file or across files).

    Args:
        input_folder: Directory containing the ``*_event_output.csv`` files.
            Defaults to the module-level ``INPUT_FOLDER``.
        chunk_size: Rows per pandas chunk. Defaults to ``CHUNK_SIZE``.
    """
    ticket_ids = set()   # every TICKET_ID encountered so far
    duplicates = set()   # TICKET_IDs encountered at least twice

    files = [f for f in os.listdir(input_folder) if f.endswith("_event_output.csv")]
    if not files:
        print("No event output files found.")
        return

    for file in files:
        file_path = os.path.join(input_folder, file)
        print(f"\nChecking {file_path}...")

        # Pre-count data rows so tqdm can show a bounded bar. Use a context
        # manager — the original `open()` leaked the file handle. (The old
        # `.endswith(".csv")` guard was always true given the filter above.)
        with open(file_path, encoding="utf-8") as fh:
            total_rows = sum(1 for _ in fh) - 1  # subtract the header row

        # Context manager guarantees the bar is closed even on an exception.
        with tqdm(total=total_rows, desc="Rows processed", unit="row") as pbar:
            # Stream the CSV in chunks; only the TICKET_ID column is loaded.
            for chunk in pd.read_csv(file_path, usecols=["TICKET_ID"],
                                     chunksize=chunk_size):
                for tid in chunk["TICKET_ID"]:
                    if tid in ticket_ids:
                        duplicates.add(tid)
                    else:
                        ticket_ids.add(tid)
                pbar.update(len(chunk))

    if duplicates:
        print(f"\nDuplicate TICKET_IDs found ({len(duplicates)}):")
        for dup in duplicates:
            print(dup)
    else:
        print("\nNo duplicate TICKET_IDs found. All ticket IDs are unique.")
# Run the duplicate check only when executed as a script, not on import.
if __name__ == "__main__":
    main()