Skip to content
This repository was archived by the owner on Nov 19, 2024. It is now read-only.

Commit 25c7bbb

Browse files
committed
test refactoring
1 parent b6efd42 commit 25c7bbb

5 files changed

Lines changed: 157 additions & 74 deletions

File tree

TODO.md

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,7 @@
11
# TODO list
22

3-
+ get rid of tensorflow-cpu in tests
43
+ Auto value for `-D INF_ENGINE_RELEASE`: https://github.com/openvinotoolkit/openvino/issues/1435
54
+ <https://answers.opencv.org/question/236271/what-the-difference-between-cv_version_status-values/>
65
+ `ENABLE_AVX512F`, how often you see such CPUs in clouds?
76
+ `avresample` from ffmpeg to the opencv, do we need it?
8-
+ `se_net.xml` downloads badly
97
+ results of `test_inference_engine()[::-1]`

tests/examples.ipynb

Lines changed: 66 additions & 43 deletions
Large diffs are not rendered by default.

tests/pixellink.py

Lines changed: 43 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,21 +5,24 @@
55
"""
66
import cv2
77
import numpy as np
8-
from scipy.special import softmax
98
from skimage.morphology import label
109
from skimage.measure import regionprops
1110
from typing import List, Tuple
1211
from skimage.measure._regionprops import RegionProperties
1312

1413

1514
class PixelLinkDetector():
16-
""" Wrapper class for Intel's version of PixelLink text-detection-0001
15+
""" Wrapper class for Intel's version of PixelLink text detector
16+
17+
See https://github.com/openvinotoolkit/open_model_zoo/blob/master/models/intel/ \
18+
text-detection-0004/description/text-detection-0004.md
19+
1720
:param xml_model_path: path to XML file
1821
1922
**Example:**
2023
2124
.. code-block:: python
22-
detector = PixelLinkDetector('text-detection-0002.xml')
25+
detector = PixelLinkDetector('text-detection-0004.xml')
2326
img = cv2.imread('tmp.jpg')
2427
# ~250ms on i7-6700K
2528
detector.detect(img)
@@ -35,7 +38,15 @@ def __init__(self, xml_model_path: str, txt_threshold=0.5):
3538
self._txt_threshold = txt_threshold
3639

3740
def detect(self, img: np.ndarray) -> None:
38-
""" GetPixelLink's outputs
41+
""" GetPixelLink's outputs (BxCxHxW):
42+
+ [1x16x192x320] - logits related to linkage between pixels and their neighbors
43+
+ [1x2x192x320] - logits related to text/no-text classification for each pixel
44+
45+
B - batch size
46+
C - number of channels
47+
H - image height
48+
W - image width
49+
3950
:param img: image as ``numpy.ndarray``
4051
"""
4152
self._img_shape = img.shape
@@ -47,29 +58,48 @@ def detect(self, img: np.ndarray) -> None:
4758
# for text-detection-00[34]
4859
self.links, self.pixels = self._net.forward(out_layer_names)
4960

50-
def get_mask(self) -> np.array:
61+
def get_mask(self) -> np.ndarray:
5162
""" Get binary mask of detected text pixels
5263
"""
5364
pixel_mask = self._get_pixel_scores() >= self._txt_threshold
5465
return pixel_mask.astype(np.uint8)
5566

56-
def _get_pixel_scores(self) -> np.array:
57-
"get softmaxed properly shaped pixel scores"
67+
def _logsumexp(self, a: np.ndarray, axis=-1) -> np.ndarray:
68+
""" Castrated function from scipy
69+
https://github.com/scipy/scipy/blob/v1.6.2/scipy/special/_logsumexp.py
70+
71+
Compute the log of the sum of exponentials of input elements.
72+
"""
73+
a_max = np.amax(a, axis=axis, keepdims=True)
74+
tmp = np.exp(a - a_max)
75+
s = np.sum(tmp, axis=axis, keepdims=True)
76+
out = np.log(s)
77+
out += a_max
78+
return out
79+
80+
def _get_pixel_scores(self) -> np.ndarray:
81+
""" get softmaxed properly shaped pixel scores """
82+
# move channels to the end
5883
tmp = np.transpose(self.pixels, (0, 2, 3, 1))
59-
return softmax(tmp, axis=-1)[0, :, :, 1]
84+
# softmax from scipy
85+
tmp = np.exp(tmp - self._logsumexp(tmp, axis=-1))
86+
# select single batch, single channel values
87+
return tmp[0, :, :, 1]
6088

61-
def _get_txt_regions(self, pixel_mask: np.array) -> List[RegionProperties]:
62-
"kernels are class dependent"
89+
def _get_txt_regions(self, pixel_mask: np.ndarray) -> List[RegionProperties]:
90+
""" kernels are class dependent """
6391
img_h, img_w = self._img_shape[:2]
6492
_, mask = cv2.threshold(pixel_mask, 0, 1, cv2.THRESH_BINARY)
6593
# transmutations
6694
# kernel size should be image size dependent (default (21,21))
6795
# on small image it will connect separate words
6896
txt_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
6997
mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, txt_kernel)
70-
# label regions on mask of original img size
98+
# connect regions on mask of original img size
7199
mask = cv2.resize(mask, (img_w, img_h), interpolation=cv2.INTER_NEAREST)
100+
# Label connected regions of an integer array
72101
mask = label(mask, background=0, connectivity=2)
102+
# Measure properties of labeled image regions.
73103
txt_regions = regionprops(mask)
74104
return txt_regions
75105

@@ -99,4 +129,6 @@ def decode(self) -> List[Tuple[int, int, int, int]]:
99129
"""
100130
mask = self.get_mask()
101131
bboxes = self._get_txt_bboxes(self._get_txt_regions(mask))
132+
# sort by xmin, ymin
133+
bboxes = sorted(bboxes, key=lambda x: (x[1], x[0]))
102134
return bboxes

tests/requirements.txt

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1 @@
1-
tensorflow-cpu>=2.4.1
2-
numpy==1.19.2
3-
scipy==1.4.1
41
scikit-image
5-
ipython

tests/text_recognition.py

Lines changed: 48 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,33 @@
11
import cv2
22
import numpy as np
3-
import tensorflow as tf # 2.0
43
from typing import List
54

65

76
class TextRecognizer():
87
def __init__(self, xml_model_path: str):
9-
"""
8+
""" Class for the Intels' OCR model pipeline
9+
10+
See https://github.com/openvinotoolkit/open_model_zoo/blob/master/models/intel/ \
11+
text-recognition-0012/description/text-recognition-0012.md
12+
1013
:param xml_model_path: path to model's XML file
1114
"""
15+
# load model
1216
self._net = cv2.dnn.readNetFromModelOptimizer(xml_model_path, xml_model_path[:-3] + 'bin')
1317

14-
def _get_ocr_pred(self, img: np.ndarray, box: tuple) -> np.ndarray:
15-
"get OCR prediction from part of image in memory"
18+
def _get_confidences(self, img: np.ndarray, box: tuple) -> np.ndarray:
19+
""" get OCR prediction confidences from a part of image in memory
20+
21+
:param img: BGR image
22+
:param box: (ymin ,xmin ,ymax, xmax)
23+
24+
:return: blob with the shape [30, 1, 37] in the format [WxBxL], where:
25+
W - output sequence length
26+
B - batch size
27+
L - confidence distribution across alphanumeric symbols:
28+
"0123456789abcdefghijklmnopqrstuvwxyz#", where # - special
29+
blank character for CTC decoding algorithm.
30+
"""
1631
y1, x1, y2, x2 = box
1732
img = img[y1:y2, x1:x2]
1833
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
@@ -22,17 +37,36 @@ def _get_ocr_pred(self, img: np.ndarray, box: tuple) -> np.ndarray:
2237
return outs
2338

2439
def do_ocr(self, img: np.ndarray, bboxes: List[tuple]) -> List[str]:
25-
answer = []
40+
""" Run OCR pipeline for a single words
41+
42+
:param img: BGR image
43+
:param bboxes: list of separate word bboxes (ymin, xmin, ymax, xmax)
44+
45+
:return: recognized words
46+
47+
For TF version use:
48+
49+
.. code-block:: python
50+
51+
# 30 is `confs.shape[0]` it is fixed
52+
a, b = tf.nn.ctc_beam_search_decoder(confs, np.array([30]))
53+
idx_no_blanks = tf.sparse.to_dense(a[0])[0].numpy()
54+
word = ''.join(char_vec[idxs_no_blanks])
55+
"""
56+
words = []
2657
# net could detect only these chars
2758
char_vec = np.array(list("0123456789abcdefghijklmnopqrstuvwxyz#"))
2859

2960
for box in bboxes:
30-
outs = self._get_ocr_pred(img, box)
31-
# The network output can be decoded by CTC Greedy Decoder or CTC Beam Search decoder.
32-
# 30 is outs,shape[0] it is fixed
33-
a, b = tf.nn.ctc_beam_search_decoder(outs, np.array([30]))
34-
#a, b = tf.nn.ctc_greedy_decoder(outs, np.array([30]), merge_repeated=True)
35-
36-
ff = tf.sparse.to_dense(a[0])[0].numpy()
37-
answer.append("".join([char_vec[i] for i in ff]))
38-
return answer
61+
# confidence distribution across symbols
62+
confs = self._get_confidences(img, box)
63+
# get maximal confidence for the whole beam width
64+
idxs = confs[:, 0, :].argmax(axis=1)
65+
# drop blank characters '#' with id == 36 in charvec
66+
# supposedly we are taking only separate words as input
67+
idxs_no_blanks = idxs[idxs != 36]
68+
# join to string
69+
word = ''.join(char_vec[idxs_no_blanks])
70+
words.append(word)
71+
72+
return words

0 commit comments

Comments
 (0)