banderlog
diff --git a/‎.gitignore‎
Lines changed: 4 additions & 0 deletions b/‎.gitignore‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎TODO.md‎
Lines changed: 2 additions & 5 deletions b/‎TODO.md‎
Lines changed: 2 additions & 5 deletions
diff --git a/‎tests/README.md‎
Lines changed: 8 additions & 0 deletions b/‎tests/README.md‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎tests/dislike.jpg‎
98.4 KB b/‎tests/dislike.jpg‎
98.4 KB
diff --git a/‎tests/helloworld.png‎
12.7 KB b/‎tests/helloworld.png‎
12.7 KB
diff --git a/‎tests/pixellink.py‎
Lines changed: 102 additions & 0 deletions b/‎tests/pixellink.py‎
Lines changed: 102 additions & 0 deletions
diff --git a/‎tests/rateme/__init__.py‎ b/‎tests/rateme/__init__.py‎
diff --git a/‎tests/rateme/rateme.cfg‎
Lines changed: 182 additions & 0 deletions b/‎tests/rateme/rateme.cfg‎
Lines changed: 182 additions & 0 deletions
@@ -1,3 +1,5 @@
+*cache*
+.ipynb_checkpoints
 build/*
 !build/opencv
 build/opencv/*
@@ -27,3 +29,5 @@ venv
 *.swp
 *.whl
 TODO.txt
+*.bin
+*.weights
@@ -2,9 +2,6 @@
 
 **TESTS**:
 
-+ Video open
-+ Webcam open
-+ dnn module network loading (YOLO or something)
-+ IE model loading (e.g. PixelLink)
++ Automate downloading of model weights
++ Webcam open?
 
-Check <https://hackmd.io/@banderlog/H1nXBmsYB> and organize them in a separate folder.
@@ -0,0 +1,8 @@
+The [rateme](https://github.com/heyml/rateme) model is actually a YOLOv3 network.
+
+These are Intel's models: [text-detection-0004](https://github.com/opencv/open_model_zoo/blob/master/models/intel/text-detection-0004/description/text-detection-0004.md) and [text-recognition-0012](https://github.com/opencv/open_model_zoo/blob/master/models/intel/text-recognition-0012/description/text-recognition-0012.md).
+
+The test video is free and comes from here: <https://www.pexels.com/video/a-cattails-fluff-floats-in-air-2156021/>
+
+
+**MODEL WEIGHTS SHOULD BE DOWNLOADED SEPARATELY (for now)**
@@ -0,0 +1,102 @@
+""" Wrapper class for Intel's PixelLink implementation (text segmentation NN)
+    text-detection-00[34]
+
+    For text-detection-0002 you'll need to use the commented-out line in
+    detect() instead of the active one
+"""
+import cv2
+import numpy as np
+from scipy.special import softmax
+from skimage.morphology import label
+from skimage.measure import regionprops
+from typing import List, Tuple
+from skimage.measure._regionprops import RegionProperties
+
+
+class PixelLinkDetector():
+    """ Wrapper class for Intel's version of PixelLink (text-detection-00[34])
+
+        The network is loaded with ``cv2.dnn.readNet``. Usage is two-step:
+        ``detect()`` runs the forward pass and stores the raw outputs on the
+        instance, ``decode()`` turns the stored outputs into bounding boxes.
+
+        :param xml_model_path: path to XML file
+        :param txt_threshold: confidence, defaults to ``0.5``
+
+        **Example:**
+
+        .. code-block:: python
+
+            detector = PixelLinkDetector('text-detection-0004.xml')
+            img = cv2.imread('tmp.jpg')
+            # ~250ms on i7-6700K
+            detector.detect(img)
+            # ~2ms
+            bboxes = detector.decode()
+    """
+    def __init__(self, xml_model_path: str, txt_threshold: float = 0.5) -> None:
+        """
+            :param xml_model_path: path to model's XML file
+            :param txt_threshold: confidence, defaults to ``0.5``
+        """
+        # NOTE(review): the weights path is built by replacing the last three
+        # characters of the XML path with 'bin', so this assumes the path ends
+        # in 'xml' and that the .bin file sits next to it -- confirm
+        self._net = cv2.dnn.readNet(xml_model_path, xml_model_path[:-3] + 'bin')
+        self._txt_threshold = txt_threshold
+
+    def detect(self, img: np.ndarray) -> None:
+        """ Get PixelLink's outputs
+
+            Runs a forward pass and stores the raw network outputs in
+            ``self.links`` and ``self.pixels``. The shape of ``img`` is
+            remembered in ``self._img_shape`` so that the mask can later be
+            resized back to it. ``get_mask()`` and ``decode()`` read these
+            attributes, so call this method first.
+
+            :param img: image as ``numpy.ndarray``
+        """
+        self._img_shape = img.shape
+        # scale factor 1 (pixel values are not rescaled); the image is resized
+        # to the fixed size (1280, 768) passed to blobFromImage
+        blob = cv2.dnn.blobFromImage(img, 1, (1280, 768))
+        self._net.setInput(blob)
+        out_layer_names = self._net.getUnconnectedOutLayersNames()
+        # the order of the two outputs differs between model versions
+        # for text-detection-0002
+        # self.pixels, self.links = self._net.forward(out_layer_names)
+        # for text-detection-00[34]
+        self.links, self.pixels = self._net.forward(out_layer_names)
+
+    def get_mask(self) -> np.ndarray:
+        """ Get binary mask of detected text pixels
+
+            :return: ``uint8`` array, 1 where the pixel score is at least
+                     ``txt_threshold`` and 0 elsewhere
+        """
+        pixel_mask = self._get_pixel_scores() >= self._txt_threshold
+        return pixel_mask.astype(np.uint8)
+
+    def _get_pixel_scores(self) -> np.ndarray:
+        """ Get softmaxed, properly shaped pixel scores
+        """
+        # move axis 1 to the end so the softmax runs over it
+        tmp = np.transpose(self.pixels, (0, 2, 3, 1))
+        # first item of axis 0, index 1 of the softmaxed axis
+        # (presumably the "text" class score -- TODO confirm)
+        return softmax(tmp, axis=-1)[0, :, :, 1]
+
+    def _get_txt_regions(self, pixel_mask: np.ndarray) -> List[RegionProperties]:
+        """ Label connected text regions on a mask of original image size
+
+            Original author's note: kernels are class dependent.
+
+            :param pixel_mask: binary mask, e.g. the result of ``get_mask()``
+            :return: result of ``skimage.measure.regionprops`` on the
+                     labelled mask
+        """
+        img_h, img_w = self._img_shape[:2]
+        # every value above 0 becomes 1
+        _, mask = cv2.threshold(pixel_mask, 0, 1, cv2.THRESH_BINARY)
+        # morphological transformations
+        # kernel size should be image size dependent (default (21,21))
+        # on small image it will connect separate words
+        txt_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
+        mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, txt_kernel)
+        # label regions on mask of original img size
+        # (nearest-neighbour interpolation, so no new values are introduced)
+        mask = cv2.resize(mask, (img_w, img_h), interpolation=cv2.INTER_NEAREST)
+        mask = label(mask, background=0, connectivity=2)
+        txt_regions = regionprops(mask)
+        return txt_regions
+
+    def _get_txt_bboxes(self, txt_regions: List[RegionProperties]) -> List[Tuple[int, int, int, int]]:
+        """ Filter text regions by area and height
+
+            A region is kept when its area is greater than ``min_area`` (0)
+            and its bounding box is taller than ``min_height`` (4).
+
+            :param txt_regions: regions as returned by ``_get_txt_regions()``
+            :return: ``[(ymin, xmin, ymax, xmax)]``
+        """
+        min_area = 0
+        min_height = 4
+        boxes = []
+        for p in txt_regions:
+            if p.area > min_area:
+                bbox = p.bbox
+                # height = ymax - ymin
+                if (bbox[2] - bbox[0]) > min_height:
+                    boxes.append(bbox)
+        return boxes
+
+    def decode(self) -> List[Tuple[int, int, int, int]]:
+        """ Decode PixelLink's output
+
+            Works on the outputs stored by the last ``detect()`` call, so
+            ``detect()`` has to be called first.
+
+            :return: bounding_boxes
+
+            .. note::
+                bounding_boxes format: [ymin, xmin, ymax, xmax]
+
+        """
+        mask = self.get_mask()
+        bboxes = self._get_txt_bboxes(self._get_txt_regions(mask))
+        return bboxes
@@ -0,0 +1,182 @@
+[net]
+# Testing
+#batch=1
+#subdivisions=1
+# Training
+batch=64
+subdivisions=32
+width=416
+height=416
+channels=3
+momentum=0.9
+decay=0.0005
+angle=0
+saturation = 1.5
+exposure = 1.5
+hue=.1
+
+learning_rate=0.001
+burn_in=1000
+max_batches = 4000
+policy=steps
+steps=3200,3600
+scales=.1,.1
+
+[convolutional]
+batch_normalize=1
+filters=16
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[maxpool]
+size=2
+stride=2
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[maxpool]
+size=2
+stride=2
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[maxpool]
+size=2
+stride=2
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[maxpool]
+size=2
+stride=2
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[maxpool]
+size=2
+stride=2
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[maxpool]
+size=2
+stride=1
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=3
+stride=1
+pad=1
+activation=leaky
+
+###########
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+size=1
+stride=1
+pad=1
+filters=21
+activation=linear
+
+
+
+[yolo]
+mask = 3,4,5
+anchors = 10,14,  23,27,  37,58,  81,82,  135,169,  344,319
+classes=2
+num=6
+jitter=.3
+ignore_thresh = .7
+truth_thresh = 1
+random=1
+
+[route]
+layers = -4
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[upsample]
+stride=2
+
+[route]
+layers = -1, 8
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+size=1
+stride=1
+pad=1
+filters=21
+activation=linear
+
+[yolo]
+mask = 0,1,2
+anchors = 10,14,  23,27,  37,58,  81,82,  135,169,  344,319
+classes=2
+num=6
+jitter=.3
+ignore_thresh = .7
+truth_thresh = 1
+random=1