Skip to content

Commit 9179021

Browse files
committed
DeepLabv3 for document detection
1 parent b61b324 commit 9179021

18 files changed

Lines changed: 2365 additions & 0 deletions

examples/DeepLabv3/.gitignore

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
# Python
2+
__pycache__/
3+
*.py[cod]
4+
*$py.class
5+
*.so
6+
.Python
7+
build/
8+
develop-eggs/
9+
dist/
10+
downloads/
11+
eggs/
12+
.eggs/
13+
lib/
14+
lib64/
15+
parts/
16+
sdist/
17+
var/
18+
wheels/
19+
*.egg-info/
20+
.installed.cfg
21+
*.egg
22+
23+
# Virtual Environment
24+
venv/
25+
ENV/
26+
env/
27+
28+
# IDE
29+
.vscode/
30+
.idea/
31+
*.swp
32+
*.swo
33+
*~
34+
35+
# OS
36+
.DS_Store
37+
Thumbs.db
38+
39+
# Project specific
40+
output/
41+
*.pth
42+
!model_mbv3_iou_mix_2C049.pth
43+
*.onnx
44+
!web_app/document_detector.onnx
45+
46+
# Logs
47+
*.log
48+
49+
# Jupyter
50+
.ipynb_checkpoints/
51+
*.ipynb
52+
53+
# Test files
54+
test_*.py
55+
*_test.py

examples/DeepLabv3/README.md

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
# Document Detection Application
2+
3+
A deep learning-based document boundary detection application using DeepLabV3 semantic segmentation with MobileNetV3-Large backbone. Available as both a desktop GUI (PySide6) and a web application (pure JavaScript with ONNX Runtime).
4+
5+
## Features
6+
7+
### Desktop Application (Python + PySide6)
8+
- 📁 **Image Upload**: Load and process images from your file system
9+
- 📷 **Real-time Webcam**: Live document detection from webcam feed
10+
- 🎯 **Accurate Detection**: DeepLabV3 with MobileNetV3-Large backbone
11+
- 📊 **Performance Metrics**: Real-time display of preprocessing, inference, and post-processing times
12+
- 💾 **Export Results**: Save overlay, mask, and cropped document images
13+
- 🖼️ **Interactive Viewer**: Zoom, pan, and fit-to-window controls
14+
15+
### Web Application (Pure JavaScript)
16+
- 🌐 **Browser-based**: Runs entirely in the browser using ONNX Runtime Web
17+
- **No Backend Required**: All inference happens client-side via WebAssembly
18+
- 📱 **Responsive Design**: Works on desktop and mobile devices
19+
- 🎨 **Modern UI**: Clean, premium interface with real-time metrics
20+
- 📸 **Webcam Support**: Real-time document detection in the browser
21+
22+
## Model Performance
23+
24+
**MobileNetV3-Large Backbone:**
25+
- **Parameters**: 11,020,594
26+
- **Input Size**: 384×384
27+
- **Inference Time** (CPU): ~180-210ms
28+
- **FPS**: ~4-5 (CPU), ~30+ (GPU)
29+
30+
## Installation
31+
32+
### Prerequisites
33+
- Python 3.8 or higher
34+
- pip package manager
35+
36+
### Installation
37+
38+
```bash
39+
pip install -r requirements.txt
40+
```
41+
42+
43+
## Usage
44+
45+
### Desktop Application
46+
47+
Run the desktop GUI application:
48+
49+
```bash
50+
python main.py
51+
```
52+
53+
**Controls:**
54+
- **Load Image**: Open an image file for processing
55+
- **Start Webcam**: Enable real-time webcam detection
56+
- **Process Image**: Manually process the current image
57+
- **Export Results**: Save processed images to disk
58+
- **Clear**: Reset the application state
59+
60+
### Web Application
61+
62+
1. **Generate ONNX model** (first time only):
63+
```bash
64+
python export_onnx.py
65+
```
66+
67+
2. **Start the web server**:
68+
```bash
69+
cd web_app
70+
python -m http.server
71+
```
72+
73+
3. **Open in browser**: Navigate to `http://localhost:8000`
74+
75+
The web app will load the ONNX model and run inference entirely in your browser using WebAssembly.
76+
77+
## Credits
78+
79+
This project is based on the LearnOpenCV article:
80+
[Deep Learning Based Document Segmentation Using Semantic Segmentation DeepLabV3 on Custom Dataset](https://learnopencv.com/deep-learning-based-document-segmentation-using-semantic-segmentation-deeplabv3-on-custom-dataset/) and [sample code](https://github.com/spmallick/learnopencv/tree/master/Document-Scanner-Custom-Semantic-Segmentation-using-PyTorch-DeepLabV3).
81+

examples/DeepLabv3/config.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
"""
2+
Configuration file for Document Detection Application
3+
"""
4+
import os
5+
from pathlib import Path
6+
7+
# Paths
8+
BASE_DIR = Path(__file__).parent
9+
OUTPUT_DIR = BASE_DIR / "output"
10+
11+
# Model Configuration
12+
MODELS = {
13+
'MobileNetV3-Large': {
14+
'path': BASE_DIR / "model_mbv3_iou_mix_2C049.pth",
15+
'backbone': 'mbv3',
16+
'description': 'DeepLabV3 with MobileNetV3-Large backbone'
17+
}
18+
}
19+
20+
DEFAULT_MODEL = 'MobileNetV3-Large'
21+
22+
MODEL_COMMON_CONFIG = {
23+
'num_classes': 2, # Background and Document
24+
'input_size': (384, 384), # As used in training
25+
}
26+
27+
# Image Preprocessing
28+
IMAGENET_MEAN = [0.485, 0.456, 0.406]
29+
IMAGENET_STD = [0.229, 0.224, 0.225]
30+
31+
# Device Configuration
32+
DEVICE_PREFERENCE = 'cpu' # 'cuda' or 'cpu'
33+
34+
# UI Configuration
35+
WINDOW_CONFIG = {
36+
'title': 'Document Detection - DeepLabV3',
37+
'width': 1400,
38+
'height': 900,
39+
'min_width': 1000,
40+
'min_height': 700,
41+
}
42+
43+
# Supported Image Formats
44+
SUPPORTED_FORMATS = [
45+
'*.jpg', '*.jpeg', '*.png', '*.bmp', '*.tiff', '*.tif'
46+
]
47+
48+
# Visualization Colors (BGR format for OpenCV)
49+
COLORS = {
50+
'document_mask': (0, 255, 0), # Green
51+
'boundary': (0, 0, 255), # Red
52+
'corners': (255, 0, 0), # Blue
53+
'text': (255, 255, 255), # White
54+
}
55+
56+
# Performance Settings
57+
PERFORMANCE_CONFIG = {
58+
'history_size': 50, # Number of frames to keep in metrics history
59+
'webcam_fps': 30,
60+
}
61+
62+
# Create output directory if it doesn't exist
63+
OUTPUT_DIR.mkdir(exist_ok=True)

examples/DeepLabv3/export_onnx.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
"""
2+
Script to export PyTorch model to ONNX format for web deployment
3+
"""
4+
import torch
5+
import torch.onnx
6+
from model_loader import load_model
7+
import config
8+
import sys
9+
from pathlib import Path
10+
11+
def export_to_onnx(model_name="MobileNetV3-Large", output_name="document_detector.onnx"):
    """Export a trained PyTorch segmentation model to ONNX for web deployment.

    Args:
        model_name: Registry key understood by model_loader.load_model.
        output_name: File name of the resulting .onnx file, written into
            the web_app/ directory next to this script.

    Returns:
        True if the export succeeded, False if loading or export failed.
    """
    print(f"Loading model: {model_name}...")

    # Load the model
    try:
        # Force CPU for export to avoid any CUDA dependencies in the exported graph
        model, _ = load_model(model_name, device='cpu')
        model.eval()
    except Exception as e:
        print(f"Error loading model: {e}")
        return False

    # Dummy input shaped (batch, channels, height, width). The spatial size
    # comes from the shared training config so the exported graph stays
    # consistent with the rest of the project instead of hard-coding 384.
    # Note: no requires_grad here — gradients are never needed for tracing.
    height, width = config.MODEL_COMMON_CONFIG['input_size']
    dummy_input = torch.randn(1, 3, height, width)

    output_path = config.BASE_DIR / "web_app" / output_name
    output_path.parent.mkdir(exist_ok=True)

    print(f"Exporting to {output_path}...")

    # Export
    try:
        torch.onnx.export(
            model,                     # model being run
            dummy_input,               # model input (or a tuple for multiple inputs)
            str(output_path),          # where to save the model
            export_params=True,        # store the trained weights inside the model file
            opset_version=12,          # ONNX opset targeted by ONNX Runtime Web
            do_constant_folding=True,  # fold constant subgraphs for size/speed
            input_names=['input'],     # the model's input names
            output_names=['output'],   # the model's output names
            dynamic_axes={             # allow a variable batch size at runtime
                'input': {0: 'batch_size'},
                'output': {0: 'batch_size'}
            }
        )
        print("✅ Export successful!")
        return True
    except Exception as e:
        print(f"❌ Export failed: {e}")
        return False

if __name__ == "__main__":
    success = export_to_onnx()
    sys.exit(0 if success else 1)

examples/DeepLabv3/image_viewer.py

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
"""
2+
Custom image viewer widget with zoom and pan capabilities
3+
"""
4+
from PySide6.QtWidgets import QGraphicsView, QGraphicsScene, QGraphicsPixmapItem
5+
from PySide6.QtCore import Qt, Signal, QPointF
6+
from PySide6.QtGui import QPixmap, QWheelEvent, QMouseEvent
7+
import numpy as np
8+
from utils import numpy_to_qpixmap
9+
10+
class ImageViewer(QGraphicsView):
    """Interactive image viewer with wheel zoom, drag panning and fit-to-view.

    Signals:
        imageClicked (QPointF): emitted with the scene position of a
            left-button mouse press.
    """

    imageClicked = Signal(QPointF)

    def __init__(self, parent=None):
        super().__init__(parent)

        self.scene = QGraphicsScene(self)
        self.setScene(self.scene)

        self.pixmap_item = None   # QGraphicsPixmapItem currently displayed, if any
        self.zoom_factor = 1.0    # cumulative zoom relative to the fitted view
        self.min_zoom = 0.1
        self.max_zoom = 10.0

        # Left-drag pans the image
        self.setDragMode(QGraphicsView.ScrollHandDrag)

        # Enable smooth scaling. The previous code called
        # setRenderHint(self.renderHints()), which fed the current hints back
        # into the setter and therefore never enabled anything.
        self.setRenderHints(QPainter.Antialiasing | QPainter.SmoothPixmapTransform)

        # Scrollbars appear only when the image exceeds the viewport
        self.setHorizontalScrollBarPolicy(Qt.ScrollBarAsNeeded)
        self.setVerticalScrollBarPolicy(Qt.ScrollBarAsNeeded)

        # Background
        self.setBackgroundBrush(Qt.darkGray)

    def set_image(self, image: np.ndarray):
        """
        Set image to display and fit it to the viewport.

        Args:
            image: Numpy array image (converted via numpy_to_qpixmap)
        """
        # Clear scene
        self.scene.clear()

        # Convert to pixmap
        pixmap = numpy_to_qpixmap(image)

        # Add to scene
        self.pixmap_item = QGraphicsPixmapItem(pixmap)
        self.scene.addItem(self.pixmap_item)

        # Reset zoom
        self.zoom_factor = 1.0
        self.fit_in_view()

    def set_pixmap(self, pixmap: QPixmap):
        """Set a ready-made QPixmap to display and fit it to the viewport."""
        self.scene.clear()
        self.pixmap_item = QGraphicsPixmapItem(pixmap)
        self.scene.addItem(self.pixmap_item)
        self.zoom_factor = 1.0
        self.fit_in_view()

    def fit_in_view(self):
        """Scale the view so the whole image is visible, preserving aspect ratio."""
        if self.pixmap_item:
            self.fitInView(self.pixmap_item, Qt.KeepAspectRatio)
            self.zoom_factor = 1.0

    def zoom_in(self):
        """Zoom in by 25%."""
        self.scale_view(1.25)

    def zoom_out(self):
        """Zoom out by 20%."""
        self.scale_view(0.8)

    def scale_view(self, factor: float):
        """Scale the view by *factor*, ignoring changes outside [min_zoom, max_zoom]."""
        new_zoom = self.zoom_factor * factor

        if new_zoom < self.min_zoom or new_zoom > self.max_zoom:
            return

        self.scale(factor, factor)
        self.zoom_factor = new_zoom

    def wheelEvent(self, event: QWheelEvent):
        """Handle mouse wheel for zooming (up = zoom in, down = zoom out)."""
        if event.angleDelta().y() > 0:
            self.scale_view(1.15)
        else:
            self.scale_view(0.85)

    def mousePressEvent(self, event: QMouseEvent):
        """Emit imageClicked with the scene position of a left-button press."""
        if event.button() == Qt.LeftButton:
            scene_pos = self.mapToScene(event.pos())
            self.imageClicked.emit(scene_pos)
        super().mousePressEvent(event)

    def clear(self):
        """Remove the current image and reset zoom state."""
        self.scene.clear()
        self.pixmap_item = None
        self.zoom_factor = 1.0

0 commit comments

Comments
 (0)