Skip to content
This repository was archived by the owner on Nov 19, 2024. It is now read-only.

Commit 25c7bbb

Browse files
committed
test refactoring
1 parent b6efd42 commit 25c7bbb

5 files changed

Lines changed: 157 additions & 74 deletions

File tree

TODO.md

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,7 @@
11
# TODO list
22

3-
+ get rid of tensorflow-cpu in tests
43
+ Auto value for `-D INF_ENGINE_RELEASE`: https://github.com/openvinotoolkit/openvino/issues/1435
54
+ <https://answers.opencv.org/question/236271/what-the-difference-between-cv_version_status-values/>
65
+ `ENABLE_AVX512F`, how often you see such CPUs in clouds?
76
+ `avresample` from ffmpeg to the opencv, do we need it?
8-
+ `se_net.xml` downloads badly
97
+ results of `test_inference_engine()[::-1]`

tests/examples.ipynb

Lines changed: 66 additions & 43 deletions
Large diffs are not rendered by default.

tests/pixellink.py

Lines changed: 43 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,21 +5,24 @@
55
"""
66
import cv2
77
import numpy as np
8-
from scipy.special import softmax
98
from skimage.morphology import label
109
from skimage.measure import regionprops
1110
from typing import List, Tuple
1211
from skimage.measure._regionprops import RegionProperties
1312

1413

1514
class PixelLinkDetector():
16-
""" Wrapper class for Intel's version of PixelLink text-detection-0001
15+
""" Wrapper class for Intel's version of PixelLink text detector
16+
17+
See https://github.com/openvinotoolkit/open_model_zoo/blob/master/models/intel/ \
18+
text-detection-0004/description/text-detection-0004.md
19+
1720
:param xml_model_path: path to XML file
1821
1922
**Example:**
2023
2124
.. code-block:: python
22-
detector = PixelLinkDetector('text-detection-0002.xml')
25+
detector = PixelLinkDetector('text-detection-0004.xml')
2326
img = cv2.imread('tmp.jpg')
2427
# ~250ms on i7-6700K
2528
detector.detect(img)
@@ -35,7 +38,15 @@ def __init__(self, xml_model_path: str, txt_threshold=0.5):
3538
self._txt_threshold = txt_threshold
3639

3740
def detect(self, img: np.ndarray) -> None:
38-
""" GetPixelLink's outputs
41+
""" GetPixelLink's outputs (BxCxHxW):
42+
+ [1x16x192x320] - logits related to linkage between pixels and their neighbors
43+
+ [1x2x192x320] - logits related to text/no-text classification for each pixel
44+
45+
B - batch size
46+
C - number of channels
47+
H - image height
48+
W - image width
49+
3950
:param img: image as ``numpy.ndarray``
4051
"""
4152
self._img_shape = img.shape
@@ -47,29 +58,48 @@ def detect(self, img: np.ndarray) -> None:
4758
# for text-detection-00[34]
4859
self.links, self.pixels = self._net.forward(out_layer_names)
4960

50-
def get_mask(self) -> np.array:
61+
def get_mask(self) -> np.ndarray:
5162
""" Get binary mask of detected text pixels
5263
"""
5364
pixel_mask = self._get_pixel_scores() >= self._txt_threshold
5465
return pixel_mask.astype(np.uint8)
5566

56-
def _get_pixel_scores(self) -> np.array:
57-
"get softmaxed properly shaped pixel scores"
67+
def _logsumexp(self, a: np.ndarray, axis=-1) -> np.ndarray:
68+
""" Castrated function from scipy
69+
https://github.com/scipy/scipy/blob/v1.6.2/scipy/special/_logsumexp.py
70+
71+
Compute the log of the sum of exponentials of input elements.
72+
"""
73+
a_max = np.amax(a, axis=axis, keepdims=True)
74+
tmp = np.exp(a - a_max)
75+
s = np.sum(tmp, axis=axis, keepdims=True)
76+
out = np.log(s)
77+
out += a_max
78+
return out
79+
80+
def _get_pixel_scores(self) -> np.ndarray:
81+
""" get softmaxed properly shaped pixel scores """
82+
# move channels to the end
5883
tmp = np.transpose(self.pixels, (0, 2, 3, 1))
59-
return softmax(tmp, axis=-1)[0, :, :, 1]
84+
# softmax from scipy
85+
tmp = np.exp(tmp - self._logsumexp(tmp, axis=-1))
86+
# select single batch, single channel values
87+
return tmp[0, :, :, 1]
6088

61-
def _get_txt_regions(self, pixel_mask: np.array) -> List[RegionProperties]:
62-
"kernels are class dependent"
89+
def _get_txt_regions(self, pixel_mask: np.ndarray) -> List[RegionProperties]:
90+
""" kernels are class dependent """
6391
img_h, img_w = self._img_shape[:2]
6492
_, mask = cv2.threshold(pixel_mask, 0, 1, cv2.THRESH_BINARY)
6593
# transmutations
6694
# kernel size should be image size dependent (default (21,21))
6795
# on small image it will connect separate words
6896
txt_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
6997
mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, txt_kernel)
70-
# label regions on mask of original img size
98+
# connect regions on mask of original img size
7199
mask = cv2.resize(mask, (img_w, img_h), interpolation=cv2.INTER_NEAREST)
100+
# Label connected regions of an integer array
72101
mask = label(mask, background=0, connectivity=2)
102+
# Measure properties of labeled image regions.
73103
txt_regions = regionprops(mask)
74104
return txt_regions
75105

@@ -99,4 +129,6 @@ def decode(self) -> List[Tuple[int, int, int, int]]:
99129
"""
100130
mask = self.get_mask()
101131
bboxes = self._get_txt_bboxes(self._get_txt_regions(mask))
132+
# sort by xmin, ymin
133+
bboxes = sorted(bboxes, key=lambda x: (x[1], x[0]))
102134
return bboxes

tests/requirements.txt

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1 @@
1-
tensorflow-cpu>=2.4.1
2-
numpy==1.19.2
3-
scipy==1.4.1
41
scikit-image
5-
ipython

tests/text_recognition.py

Lines changed: 48 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,33 @@
11
import cv2
22
import numpy as np
3-
import tensorflow as tf # 2.0
43
from typing import List
54

65

76
class TextRecognizer():
87
def __init__(self, xml_model_path: str):
9-
"""
8+
""" Class for the Intels' OCR model pipeline
9+
10+
See https://github.com/openvinotoolkit/open_model_zoo/blob/master/models/intel/ \
11+
text-recognition-0012/description/text-recognition-0012.md
12+
1013
:param xml_model_path: path to model's XML file
1114
"""
15+
# load model
1216
self._net = cv2.dnn.readNetFromModelOptimizer(xml_model_path, xml_model_path[:-3] + 'bin')
1317

14-
def _get_ocr_pred(self, img: np.ndarray, box: tuple) -> np.ndarray:
15-
"get OCR prediction from part of image in memory"
18+
def _get_confidences(self, img: np.ndarray, box: tuple) -> np.ndarray:
19+
""" get OCR prediction confidences from a part of image in memory
20+
21+
:param img: BGR image
22+
:param box: (ymin ,xmin ,ymax, xmax)
23+
24+
:return: blob with the shape [30, 1, 37] in the format [WxBxL], where:
25+
W - output sequence length
26+
B - batch size
27+
L - confidence distribution across alphanumeric symbols:
28+
"0123456789abcdefghijklmnopqrstuvwxyz#", where # - special
29+
blank character for CTC decoding algorithm.
30+
"""
1631
y1, x1, y2, x2 = box
1732
img = img[y1:y2, x1:x2]
1833
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
@@ -22,17 +37,36 @@ def _get_ocr_pred(self, img: np.ndarray, box: tuple) -> np.ndarray:
2237
return outs
2338

2439
def do_ocr(self, img: np.ndarray, bboxes: List[tuple]) -> List[str]:
25-
answer = []
40+
""" Run OCR pipeline for a single words
41+
42+
:param img: BGR image
43+
:param bboxes: list of separate word bboxes (ymin, xmin, ymax, xmax)
44+
45+
:return: recognized words
46+
47+
For TF version use:
48+
49+
.. code-block:: python
50+
51+
# 30 is `confs.shape[0]` it is fixed
52+
a, b = tf.nn.ctc_beam_search_decoder(confs, np.array([30]))
53+
idx_no_blanks = tf.sparse.to_dense(a[0])[0].numpy()
54+
word = ''.join(char_vec[idxs_no_blanks])
55+
"""
56+
words = []
2657
# net could detect only these chars
2758
char_vec = np.array(list("0123456789abcdefghijklmnopqrstuvwxyz#"))
2859

2960
for box in bboxes:
30-
outs = self._get_ocr_pred(img, box)
31-
# The network output can be decoded by CTC Greedy Decoder or CTC Beam Search decoder.
32-
# 30 is outs,shape[0] it is fixed
33-
a, b = tf.nn.ctc_beam_search_decoder(outs, np.array([30]))
34-
#a, b = tf.nn.ctc_greedy_decoder(outs, np.array([30]), merge_repeated=True)
35-
36-
ff = tf.sparse.to_dense(a[0])[0].numpy()
37-
answer.append("".join([char_vec[i] for i in ff]))
38-
return answer
61+
# confidence distribution across symbols
62+
confs = self._get_confidences(img, box)
63+
# get maximal confidence for the whole beam width
64+
idxs = confs[:, 0, :].argmax(axis=1)
65+
# drop blank characters '#' with id == 36 in charvec
66+
# supposedly we are taking only separate words as input
67+
idxs_no_blanks = idxs[idxs != 36]
68+
# join to string
69+
word = ''.join(char_vec[idxs_no_blanks])
70+
words.append(word)
71+
72+
return words

0 commit comments

Comments
 (0)