-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcsv-excel-processor_events_output_check_duplicate_hard_id.py
More file actions
51 lines (41 loc) · 1.52 KB
/
csv-excel-processor_events_output_check_duplicate_hard_id.py
File metadata and controls
51 lines (41 loc) · 1.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#!/usr/bin/env python3
"""
Check for duplicate TICKET_IDs across all event_output files (chunked)
This version handles large files efficiently using chunks and shows progress bars.
"""
import pandas as pd
import os
from tqdm import tqdm
# Directory scanned for "*_event_output.csv" files — site-specific path; adjust per machine.
INPUT_FOLDER = "/home/steftzor/Downloads/output_csv-files"
# Rows read per pandas chunk; bounds peak memory when files are large.
CHUNK_SIZE = 10000 # rows per chunk
def main(input_folder=INPUT_FOLDER, chunk_size=CHUNK_SIZE):
    """Report TICKET_IDs that occur more than once across all event output files.

    Scans every ``*_event_output.csv`` file in *input_folder*, streaming each
    one in chunks of *chunk_size* rows so memory stays bounded, and prints any
    TICKET_ID seen at least twice (within one file or across files).

    Args:
        input_folder: Directory containing the ``*_event_output.csv`` files.
            Defaults to the module-level ``INPUT_FOLDER``.
        chunk_size: Rows per pandas chunk. Defaults to ``CHUNK_SIZE``.
    """
    ticket_ids = set()   # every TICKET_ID encountered so far
    duplicates = set()   # TICKET_IDs encountered at least twice

    files = [f for f in os.listdir(input_folder) if f.endswith("_event_output.csv")]
    if not files:
        print("No event output files found.")
        return

    for file in files:
        file_path = os.path.join(input_folder, file)
        print(f"\nChecking {file_path}...")

        # Pre-count data rows so tqdm can show a bounded bar. Use a context
        # manager — the original `open()` leaked the file handle. (The old
        # `.endswith(".csv")` guard was always true given the filter above.)
        with open(file_path, encoding="utf-8") as fh:
            total_rows = sum(1 for _ in fh) - 1  # subtract the header row

        # Context manager guarantees the bar is closed even on an exception.
        with tqdm(total=total_rows, desc="Rows processed", unit="row") as pbar:
            # Stream the CSV in chunks; only the TICKET_ID column is loaded.
            for chunk in pd.read_csv(file_path, usecols=["TICKET_ID"],
                                     chunksize=chunk_size):
                for tid in chunk["TICKET_ID"]:
                    if tid in ticket_ids:
                        duplicates.add(tid)
                    else:
                        ticket_ids.add(tid)
                pbar.update(len(chunk))

    if duplicates:
        print(f"\nDuplicate TICKET_IDs found ({len(duplicates)}):")
        for dup in duplicates:
            print(dup)
    else:
        print("\nNo duplicate TICKET_IDs found. All ticket IDs are unique.")
# Run the duplicate check only when executed as a script, not on import.
if __name__ == "__main__":
    main()