# Source: Application-toText/process_recognition_pdf.py (master branch, IvanGaideek/Application-toText on GitHub)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import json
import os

import fitz  # PyMuPDF
from pdf2docx import Converter
from auxiliary_funcs import get_current_time
import csv

from operation_with_settings import get_data_settings


class RecognitionProcessingPDF:
    """Extract text, tables, images and a Word copy from one PDF file.

    Which artefacts are produced is decided by the ``"pdf_settings"`` section
    of the mapping returned by ``get_data_settings()``.  All output is written
    below ``<save_path["for_pdf"]>/<PDF base name>/``.
    """

    def __init__(self, pdf_path):
        """Load the settings and prepare the output directories.

        Args:
            pdf_path: Path of the PDF file to process.
        """
        self.pdf_path = pdf_path
        # Settings come from get_data_settings(); only the two keys below are used.
        data = get_data_settings()
        self.values_setting = data["pdf_settings"]
        save_path = data["save_path"]["for_pdf"]
        self.create_save_directories(save_path)
        self.errors = []  # List of errors as (time, message) tuples

    def create_save_directories(self, directory):
        """Create the output directory of this PDF and its sub-directories.

        Sets ``self.name_save_one_pdf`` (file name of the PDF without its
        extension) and ``self.path_save_one`` (``directory`` + "/" + that
        name).  ``images`` / ``tables`` sub-directories are created only when
        the matching ``SAVE_IMAGES`` / ``SAVE_TABLES`` setting is truthy.

        Args:
            directory: Root directory under which the per-PDF directory lives.
        """
        # basename + splitext strip only the directory part and the final
        # extension, so dots elsewhere in the path ("./docs/my.report.pdf")
        # no longer truncate the name.
        self.name_save_one_pdf = os.path.splitext(os.path.basename(self.pdf_path))[0]
        self.path_save_one = directory + "/" + self.name_save_one_pdf
        os.makedirs(self.path_save_one, exist_ok=True)

        sub_dirs = []  # optional sub-directories for the files of this PDF
        if self.values_setting["SAVE_IMAGES"]:
            sub_dirs.append("images")
        if self.values_setting["SAVE_TABLES"]:
            sub_dirs.append("tables")

        for sub_dir in sub_dirs:
            os.makedirs(self.path_save_one + "/" + sub_dir, exist_ok=True)

    @property
    def recognize(self):
        """Run every extraction enabled in the settings.

        Note that this is a property with side effects: reading it writes the
        enabled output files.

        Returns:
            A ``(text, errors)`` tuple: the text of the whole PDF and the list
            of ``(time, message)`` errors collected so far.
        """
        text = self.recognize_text()
        if self.values_setting["SAVE_TABLES"]:
            self.error_correction(self.save_tables, "The problem with the encoding of the table",
                                  "The problem with the permission of the dir for table")
        if self.values_setting["SAVE_TXT_FILE"]:
            self.error_correction(self.convert_to_TXT, "The problem with the encoding of the text",
                                  "The problem with the permission of the dir for text")
        if self.values_setting["SAVE_IMAGES"]:
            self.error_correction(self.save_images, "The problem with the encoding of the image",
                                  "The problem with the permission of the dir for image")
        if self.values_setting["SAVE_WORD_FILE"]:
            self.error_correction(self.convert_to_WORD, "The problem with the encoding of the word",
                                  "The problem with the permission of the dir for word")
        return text, self.errors

    def error_correction(self, func, mes_unicode, mes_permission):
        """Call *func* and record encoding / permission failures.

        Only ``UnicodeEncodeError`` and ``PermissionError`` are caught; any
        other exception propagates to the caller.

        Args:
            func: Zero-argument callable to execute.
            mes_unicode: Message stored when ``UnicodeEncodeError`` is raised.
            mes_permission: Message stored when ``PermissionError`` is raised.
        """
        try:
            func()
        except UnicodeEncodeError:
            self.errors.append((get_current_time(), mes_unicode))
        except PermissionError:
            self.errors.append((get_current_time(), mes_permission))

    def convert_to_WORD(self):
        """Convert the whole PDF to ``<name>.docx`` in the output directory."""
        cv = Converter(self.pdf_path)
        try:
            # start=0, end=None converts every page
            cv.convert(self.path_save_one + "/" + self.name_save_one_pdf + ".docx", start=0, end=None)
        finally:
            # Release the converter even when the conversion fails.
            cv.close()

    def convert_to_TXT(self):
        """Write the text of the PDF to ``<name>.txt`` (UTF-8)."""
        with open(self.path_save_one + "/" + self.name_save_one_pdf + ".txt", "w", encoding="utf-8") as file:
            file.write(self.recognize_text())

    def save_images(self):
        """Save every embedded image to ``images/<page>_<index>.<ext>``.

        ``<page>`` is the 0-based page index, ``<index>`` the 1-based position
        of the image within that page.
        """
        with fitz.open(self.pdf_path) as file:
            for page_index, page in enumerate(file):
                image_list = page.get_images(full=True)  # images on the page

                for image_index, img in enumerate(image_list, start=1):
                    # the first item of an image entry is its XREF
                    xref = img[0]

                    # extract the raw bytes and the file extension
                    base_image = file.extract_image(xref)
                    image_bytes = base_image["image"]
                    image_ext = base_image["ext"]

                    image_name = self.path_save_one + "/images/" + f"{page_index}_{image_index}.{image_ext}"
                    with open(image_name, "wb") as image_file:
                        image_file.write(image_bytes)

    def save_tables(self):
        """Save the tables found on each page as CSV files.

        The first table of page ``n`` is written to ``tables/<n>.csv``.  Any
        further table ``k`` (1-based after the first) on the same page goes to
        ``tables/<n>_<k>.csv`` instead of being dropped.
        """
        with fitz.open(self.pdf_path) as file:
            for n, page in enumerate(file):
                tabs = page.find_tables()
                for table_index, table in enumerate(tabs.tables):
                    name = n if table_index == 0 else f"{n}_{table_index}"
                    self.write_to_csv(table.extract(), name)

    def write_to_csv(self, array, page):
        """Write *array* to ``tables/<page>.csv``.

        Rows are ``;``-delimited and every field is quoted.

        Args:
            array: Iterable of rows, each an iterable of cell values.
            page: Value used (via ``str``) as the file name without extension.
        """
        # newline="" is required by the csv module; without it rows end in
        # "\r\r\n" on Windows.
        with open(self.path_save_one + "/tables/" + str(page) + ".csv", "w",
                  encoding="utf-8", newline="") as file:
            writer = csv.writer(file, delimiter=";", quoting=csv.QUOTE_ALL)
            writer.writerows(array)

    def recognize_text(self):
        """Return the text of all pages of the PDF, concatenated in page order."""
        with fitz.open(self.pdf_path) as file:
            return "".join(page.get_text() for page in file)