-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocess_recognition_pdf.py
More file actions
120 lines (99 loc) · 4.94 KB
/
process_recognition_pdf.py
File metadata and controls
120 lines (99 loc) · 4.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import json
import os
import fitz # PyMuPDF
from pdf2docx import Converter
from auxiliary_funcs import get_current_time
import csv
from operation_with_settings import get_data_settings
class RecognitionProcessingPDF:
    """Extract text, tables, images and converted copies from one PDF file.

    Which artefacts are written is decided by the ``"pdf_settings"`` mapping
    returned by ``get_data_settings()``. Everything is stored in a per-PDF
    directory created below ``data["save_path"]["for_pdf"]``.
    """

    def __init__(self, pdf_path: str) -> None:
        """Read the settings and create the output directories for *pdf_path*."""
        self.pdf_path = pdf_path
        # The settings object must provide "pdf_settings" and
        # "save_path" -> "for_pdf" (presumably loaded from a JSON settings
        # file; confirm in operation_with_settings).
        data = get_data_settings()
        self.values_setting = data["pdf_settings"]
        save_path = data["save_path"]["for_pdf"]
        self.create_save_directories(save_path)
        self.errors = []  # (time, message) tuples collected by error_correction

    def create_save_directories(self, directory: str) -> None:
        """Create ``<directory>/<pdf name>`` and the enabled sub-directories.

        Sets ``self.name_save_one_pdf`` (PDF file name without directory and
        without its final extension) and ``self.path_save_one`` (the per-PDF
        output directory). ``images`` / ``tables`` sub-directories are created
        only when ``SAVE_IMAGES`` / ``SAVE_TABLES`` are truthy in the settings.
        """
        # basename + splitext keeps dots that are part of the path or the
        # file stem ("./docs/my.report.pdf" -> "my.report").
        self.name_save_one_pdf = os.path.splitext(os.path.basename(self.pdf_path))[0]
        self.path_save_one = os.path.join(directory, self.name_save_one_pdf)
        os.makedirs(self.path_save_one, exist_ok=True)

        for setting_key, sub_dir in (("SAVE_IMAGES", "images"), ("SAVE_TABLES", "tables")):
            if self.values_setting[setting_key]:
                os.makedirs(os.path.join(self.path_save_one, sub_dir), exist_ok=True)

    @property
    def recognize(self):
        """Run every extraction enabled in the settings.

        Returns:
            A ``(text, errors)`` tuple: the text of the whole PDF and the list
            of ``(time, message)`` errors recorded so far.
        """
        text = self.recognize_text()
        # (settings key, action, message on UnicodeEncodeError, message on PermissionError)
        steps = (
            ("SAVE_TABLES", self.save_tables,
             "The problem with the encoding of the table",
             "The problem with the permission of the dir for table"),
            # Reuse the text extracted above instead of parsing the PDF again.
            ("SAVE_TXT_FILE", lambda: self.convert_to_TXT(text),
             "The problem with the encoding of the text",
             "The problem with the permission of the dir for text"),
            ("SAVE_IMAGES", self.save_images,
             "The problem with the encoding of the image",
             "The problem with the permission of the dir for image"),
            ("SAVE_WORD_FILE", self.convert_to_WORD,
             "The problem with the encoding of the word",
             "The problem with the permission of the dir for word"),
        )
        for setting_key, action, mes_unicode, mes_permission in steps:
            if self.values_setting[setting_key]:
                self.error_correction(action, mes_unicode, mes_permission)
        return text, self.errors

    def error_correction(self, func, mes_unicode, mes_permission):
        """Call *func* and record encoding / permission failures.

        ``UnicodeEncodeError`` and ``PermissionError`` are appended to
        ``self.errors`` as ``(get_current_time(), message)``; any other
        exception propagates to the caller.
        """
        try:
            func()
        except UnicodeEncodeError:
            self.errors.append((get_current_time(), mes_unicode))
        except PermissionError:
            self.errors.append((get_current_time(), mes_permission))

    def convert_to_WORD(self):
        """Convert the whole PDF to ``<name>.docx`` in the per-PDF directory."""
        docx_path = os.path.join(self.path_save_one, self.name_save_one_pdf + ".docx")
        cv = Converter(self.pdf_path)
        try:
            # start=0, end=None converts every page.
            cv.convert(docx_path, start=0, end=None)
        finally:
            # Release the converter even when the conversion fails.
            cv.close()

    def convert_to_TXT(self, text=None):
        """Write the PDF text to ``<name>.txt`` in the per-PDF directory.

        Args:
            text: Already extracted text to write. When ``None`` (the default)
                the text is extracted with ``recognize_text()``.
        """
        if text is None:
            text = self.recognize_text()
        txt_path = os.path.join(self.path_save_one, self.name_save_one_pdf + ".txt")
        with open(txt_path, "w", encoding="utf-8") as file:
            file.write(text)

    def save_images(self):
        """Save every embedded image to ``images/<page>_<n>.<ext>``.

        ``<page>`` is the 0-based page index and ``<n>`` the 1-based position
        of the image in that page's image list.
        """
        images_dir = os.path.join(self.path_save_one, "images")
        with fitz.open(self.pdf_path) as file:
            for page_index, page in enumerate(file):
                image_list = page.get_images(full=True)
                for image_index, img in enumerate(image_list, start=1):
                    xref = img[0]  # first item of the image entry is its XREF
                    base_image = file.extract_image(xref)
                    image_name = os.path.join(
                        images_dir, f"{page_index}_{image_index}.{base_image['ext']}"
                    )
                    with open(image_name, "wb") as image_file:
                        image_file.write(base_image["image"])

    def save_tables(self):
        """Save the first table found on each page to ``tables/<page>.csv``.

        Pages without tables are skipped. Only ``tabs[0]`` is written, so any
        further tables on the same page are not saved.
        """
        with fitz.open(self.pdf_path) as file:
            for n, page in enumerate(file):
                tabs = page.find_tables()
                if tabs.tables:
                    self.write_to_csv(tabs[0].extract(), n)

    def write_to_csv(self, array, page):
        """Write the rows of *array* to ``tables/<page>.csv``.

        The file is ``;``-delimited with every field quoted.
        """
        csv_path = os.path.join(self.path_save_one, "tables", str(page) + ".csv")
        # newline="" lets the csv module control line endings; without it
        # rows end in "\r\r\n" on Windows.
        with open(csv_path, "w", encoding="utf-8", newline="") as file:
            writer = csv.writer(file, delimiter=";", quoting=csv.QUOTE_ALL)
            writer.writerows(array)

    def recognize_text(self):
        """Return the concatenated text of every page of the PDF."""
        with fitz.open(self.pdf_path) as file:
            return "".join(page.get_text() for page in file)