Skip to content

Commit 9179021

Browse files
committed
DeepLabv3 for document detection
1 parent b61b324 commit 9179021

18 files changed

Lines changed: 2365 additions & 0 deletions

examples/DeepLabv3/.gitignore

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
# Python
2+
__pycache__/
3+
*.py[cod]
4+
*$py.class
5+
*.so
6+
.Python
7+
build/
8+
develop-eggs/
9+
dist/
10+
downloads/
11+
eggs/
12+
.eggs/
13+
lib/
14+
lib64/
15+
parts/
16+
sdist/
17+
var/
18+
wheels/
19+
*.egg-info/
20+
.installed.cfg
21+
*.egg
22+
23+
# Virtual Environment
24+
venv/
25+
ENV/
26+
env/
27+
28+
# IDE
29+
.vscode/
30+
.idea/
31+
*.swp
32+
*.swo
33+
*~
34+
35+
# OS
36+
.DS_Store
37+
Thumbs.db
38+
39+
# Project specific
40+
output/
41+
*.pth
42+
!model_mbv3_iou_mix_2C049.pth
43+
*.onnx
44+
!web_app/document_detector.onnx
45+
46+
# Logs
47+
*.log
48+
49+
# Jupyter
50+
.ipynb_checkpoints/
51+
*.ipynb
52+
53+
# Test files
54+
test_*.py
55+
*_test.py

examples/DeepLabv3/README.md

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
# Document Detection Application
2+
3+
A deep learning-based document boundary detection application using DeepLabV3 semantic segmentation with MobileNetV3-Large backbone. Available as both a desktop GUI (PySide6) and a web application (pure JavaScript with ONNX Runtime).
4+
5+
## Features
6+
7+
### Desktop Application (Python + PySide6)
8+
- 📁 **Image Upload**: Load and process images from your file system
9+
- 📷 **Real-time Webcam**: Live document detection from webcam feed
10+
- 🎯 **Accurate Detection**: DeepLabV3 with MobileNetV3-Large backbone
11+
- 📊 **Performance Metrics**: Real-time display of preprocessing, inference, and post-processing times
12+
- 💾 **Export Results**: Save overlay, mask, and cropped document images
13+
- 🖼️ **Interactive Viewer**: Zoom, pan, and fit-to-window controls
14+
15+
### Web Application (Pure JavaScript)
16+
- 🌐 **Browser-based**: Runs entirely in the browser using ONNX Runtime Web
17+
- **No Backend Required**: All inference happens client-side via WebAssembly
18+
- 📱 **Responsive Design**: Works on desktop and mobile devices
19+
- 🎨 **Modern UI**: Clean, premium interface with real-time metrics
20+
- 📸 **Webcam Support**: Real-time document detection in the browser
21+
22+
## Model Performance
23+
24+
**MobileNetV3-Large Backbone:**
25+
- **Parameters**: 11,020,594
26+
- **Input Size**: 384×384
27+
- **Inference Time** (CPU): ~180-210ms
28+
- **FPS**: ~4-5 (CPU), ~30+ (GPU)
29+
30+
## Installation
31+
32+
### Prerequisites
33+
- Python 3.8 or higher
34+
- pip package manager
35+
36+
### Installation
37+
38+
```bash
39+
pip install -r requirements.txt
40+
```
41+
42+
43+
## Usage
44+
45+
### Desktop Application
46+
47+
Run the desktop GUI application:
48+
49+
```bash
50+
python main.py
51+
```
52+
53+
**Controls:**
54+
- **Load Image**: Open an image file for processing
55+
- **Start Webcam**: Enable real-time webcam detection
56+
- **Process Image**: Manually process the current image
57+
- **Export Results**: Save processed images to disk
58+
- **Clear**: Reset the application state
59+
60+
### Web Application
61+
62+
1. **Generate ONNX model** (first time only):
63+
```bash
64+
python export_onnx.py
65+
```
66+
67+
2. **Start the web server**:
68+
```bash
69+
cd web_app
70+
python -m http.server
71+
```
72+
73+
3. **Open in browser**: Navigate to `http://localhost:8000`
74+
75+
The web app will load the ONNX model and run inference entirely in your browser using WebAssembly.
76+
77+
## Credits
78+
79+
This project is based on the LearnOpenCV article:
80+
[Deep Learning Based Document Segmentation Using Semantic Segmentation DeepLabV3 on Custom Dataset](https://learnopencv.com/deep-learning-based-document-segmentation-using-semantic-segmentation-deeplabv3-on-custom-dataset/) and [sample code](https://github.com/spmallick/learnopencv/tree/master/Document-Scanner-Custom-Semantic-Segmentation-using-PyTorch-DeepLabV3).
81+

examples/DeepLabv3/config.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
"""
2+
Configuration file for Document Detection Application
3+
"""
4+
import os
5+
from pathlib import Path
6+
7+
# Paths
8+
BASE_DIR = Path(__file__).parent
9+
OUTPUT_DIR = BASE_DIR / "output"
10+
11+
# Model Configuration
12+
MODELS = {
13+
'MobileNetV3-Large': {
14+
'path': BASE_DIR / "model_mbv3_iou_mix_2C049.pth",
15+
'backbone': 'mbv3',
16+
'description': 'DeepLabV3 with MobileNetV3-Large backbone'
17+
}
18+
}
19+
20+
DEFAULT_MODEL = 'MobileNetV3-Large'
21+
22+
MODEL_COMMON_CONFIG = {
23+
'num_classes': 2, # Background and Document
24+
'input_size': (384, 384), # As used in training
25+
}
26+
27+
# Image Preprocessing
28+
IMAGENET_MEAN = [0.485, 0.456, 0.406]
29+
IMAGENET_STD = [0.229, 0.224, 0.225]
30+
31+
# Device Configuration
32+
DEVICE_PREFERENCE = 'cpu' # 'cuda' or 'cpu'
33+
34+
# UI Configuration
35+
WINDOW_CONFIG = {
36+
'title': 'Document Detection - DeepLabV3',
37+
'width': 1400,
38+
'height': 900,
39+
'min_width': 1000,
40+
'min_height': 700,
41+
}
42+
43+
# Supported Image Formats
44+
SUPPORTED_FORMATS = [
45+
'*.jpg', '*.jpeg', '*.png', '*.bmp', '*.tiff', '*.tif'
46+
]
47+
48+
# Visualization Colors (BGR format for OpenCV)
49+
COLORS = {
50+
'document_mask': (0, 255, 0), # Green
51+
'boundary': (0, 0, 255), # Red
52+
'corners': (255, 0, 0), # Blue
53+
'text': (255, 255, 255), # White
54+
}
55+
56+
# Performance Settings
57+
PERFORMANCE_CONFIG = {
58+
'history_size': 50, # Number of frames to keep in metrics history
59+
'webcam_fps': 30,
60+
}
61+
62+
# Create output directory if it doesn't exist
63+
OUTPUT_DIR.mkdir(exist_ok=True)

examples/DeepLabv3/export_onnx.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
"""
2+
Script to export PyTorch model to ONNX format for web deployment
3+
"""
4+
import torch
5+
import torch.onnx
6+
from model_loader import load_model
7+
import config
8+
import sys
9+
from pathlib import Path
10+
11+
def export_to_onnx(model_name="MobileNetV3-Large", output_name="document_detector.onnx"):
    """Export a trained PyTorch segmentation model to ONNX for web deployment.

    Args:
        model_name: Registry key understood by model_loader.load_model.
        output_name: File name of the resulting .onnx file, written into
            the web_app/ directory next to this script.

    Returns:
        True if the export succeeded, False if loading or export failed.
    """
    print(f"Loading model: {model_name}...")

    # Load the model
    try:
        # Force CPU for export to avoid any CUDA dependencies in the exported graph
        model, _ = load_model(model_name, device='cpu')
        model.eval()
    except Exception as e:
        print(f"Error loading model: {e}")
        return False

    # Dummy input shaped (batch, channels, height, width). The spatial size
    # comes from the shared training config so the exported graph stays
    # consistent with the rest of the project instead of hard-coding 384.
    # Note: no requires_grad here — gradients are never needed for tracing.
    height, width = config.MODEL_COMMON_CONFIG['input_size']
    dummy_input = torch.randn(1, 3, height, width)

    output_path = config.BASE_DIR / "web_app" / output_name
    output_path.parent.mkdir(exist_ok=True)

    print(f"Exporting to {output_path}...")

    # Export
    try:
        torch.onnx.export(
            model,                     # model being run
            dummy_input,               # model input (or a tuple for multiple inputs)
            str(output_path),          # where to save the model
            export_params=True,        # store the trained weights inside the model file
            opset_version=12,          # ONNX opset targeted by ONNX Runtime Web
            do_constant_folding=True,  # fold constant subgraphs for size/speed
            input_names=['input'],     # the model's input names
            output_names=['output'],   # the model's output names
            dynamic_axes={             # allow a variable batch size at runtime
                'input': {0: 'batch_size'},
                'output': {0: 'batch_size'}
            }
        )
        print("✅ Export successful!")
        return True
    except Exception as e:
        print(f"❌ Export failed: {e}")
        return False

if __name__ == "__main__":
    success = export_to_onnx()
    sys.exit(0 if success else 1)

examples/DeepLabv3/image_viewer.py

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
"""
2+
Custom image viewer widget with zoom and pan capabilities
3+
"""
4+
from PySide6.QtWidgets import QGraphicsView, QGraphicsScene, QGraphicsPixmapItem
5+
from PySide6.QtCore import Qt, Signal, QPointF
6+
from PySide6.QtGui import QPixmap, QWheelEvent, QMouseEvent
7+
import numpy as np
8+
from utils import numpy_to_qpixmap
9+
10+
class ImageViewer(QGraphicsView):
    """Interactive image viewer with wheel zoom, drag panning and fit-to-view.

    Signals:
        imageClicked (QPointF): emitted with the scene position of a
            left-button mouse press.
    """

    imageClicked = Signal(QPointF)

    def __init__(self, parent=None):
        super().__init__(parent)

        self.scene = QGraphicsScene(self)
        self.setScene(self.scene)

        self.pixmap_item = None   # QGraphicsPixmapItem currently displayed, if any
        self.zoom_factor = 1.0    # cumulative zoom relative to the fitted view
        self.min_zoom = 0.1
        self.max_zoom = 10.0

        # Left-drag pans the image
        self.setDragMode(QGraphicsView.ScrollHandDrag)

        # Enable smooth scaling. The previous code called
        # setRenderHint(self.renderHints()), which fed the current hints back
        # into the setter and therefore never enabled anything.
        self.setRenderHints(QPainter.Antialiasing | QPainter.SmoothPixmapTransform)

        # Scrollbars appear only when the image exceeds the viewport
        self.setHorizontalScrollBarPolicy(Qt.ScrollBarAsNeeded)
        self.setVerticalScrollBarPolicy(Qt.ScrollBarAsNeeded)

        # Background
        self.setBackgroundBrush(Qt.darkGray)

    def set_image(self, image: np.ndarray):
        """
        Set image to display and fit it to the viewport.

        Args:
            image: Numpy array image (converted via numpy_to_qpixmap)
        """
        # Clear scene
        self.scene.clear()

        # Convert to pixmap
        pixmap = numpy_to_qpixmap(image)

        # Add to scene
        self.pixmap_item = QGraphicsPixmapItem(pixmap)
        self.scene.addItem(self.pixmap_item)

        # Reset zoom
        self.zoom_factor = 1.0
        self.fit_in_view()

    def set_pixmap(self, pixmap: QPixmap):
        """Set a ready-made QPixmap to display and fit it to the viewport."""
        self.scene.clear()
        self.pixmap_item = QGraphicsPixmapItem(pixmap)
        self.scene.addItem(self.pixmap_item)
        self.zoom_factor = 1.0
        self.fit_in_view()

    def fit_in_view(self):
        """Scale the view so the whole image is visible, preserving aspect ratio."""
        if self.pixmap_item:
            self.fitInView(self.pixmap_item, Qt.KeepAspectRatio)
            self.zoom_factor = 1.0

    def zoom_in(self):
        """Zoom in by 25%."""
        self.scale_view(1.25)

    def zoom_out(self):
        """Zoom out by 20%."""
        self.scale_view(0.8)

    def scale_view(self, factor: float):
        """Scale the view by *factor*, ignoring changes outside [min_zoom, max_zoom]."""
        new_zoom = self.zoom_factor * factor

        if new_zoom < self.min_zoom or new_zoom > self.max_zoom:
            return

        self.scale(factor, factor)
        self.zoom_factor = new_zoom

    def wheelEvent(self, event: QWheelEvent):
        """Handle mouse wheel for zooming (up = zoom in, down = zoom out)."""
        if event.angleDelta().y() > 0:
            self.scale_view(1.15)
        else:
            self.scale_view(0.85)

    def mousePressEvent(self, event: QMouseEvent):
        """Emit imageClicked with the scene position of a left-button press."""
        if event.button() == Qt.LeftButton:
            scene_pos = self.mapToScene(event.pos())
            self.imageClicked.emit(scene_pos)
        super().mousePressEvent(event)

    def clear(self):
        """Remove the current image and reset zoom state."""
        self.scene.clear()
        self.pixmap_item = None
        self.zoom_factor = 1.0

0 commit comments

Comments
 (0)