Hi there, I'm Yibo Liu.
I'm facing some uncontrollable challenges right now and don’t have the mood to update this repo. If I can get through this, I will try to complete the repo. Torching scientists is one of the most self-destructive things a country can do to itself.
Although I am still fighting against witch hunting, I will try to provide updates to this repository before April.
Due to IP policies, we are unable to release a click-and-run version of HIPPo. However, I will provide sufficient details here to help you reproduce a HIPPo-like framework. In particular, the method does not require a pre-built 3D model and can perform 6D pose estimation directly from the first observation.
HIPPo is a training-free approach. In other words, you only need to download the pre-trained models to construct it.
| Model | Function | Link |
|---|---|---|
| Grounded SAM | Segmentation | Swint |
| Wonder3D | Multi-view Generation | GitHub |
| MASt3R | 3D Model Reconstruction | ViT-Large |
| FoundationPose | Object Tracking | FP |
| InstantMesh (Alternative) | Multi-view Generation+3D Model Reconstruction | GitHub |
Given an RGB-D image, the goal of this step is to segment the object using a mask generated by Grounded SAM based on the prompt provided by the user. You may follow this demo. Once you obtain the object mask, you need to apply the same mask to extract the depth image of the object as well.
# 1. Capture RGB-D frame
depth_img, color_img = self.cam.take_single_frame()
# 2. Detect objects from RGB image
detections = self.detector.detect_objects(color_img)
# If no object detected, skip this frame
if len(detections.class_id) == 0:
continue
# 3. Apply NMS to remove redundant detections
detections = self.detector.apply_nms(detections)
# 4. Segment detected objects (Grounded SAM)
detections = self.detector.segment_objects(color_img, detections)
# -------------------------------------------------
# 5. Extract mask (only use the first detected object)
# -------------------------------------------------
# Raw mask: (H, W) or (N, H, W)
mask_raw = detections.mask.squeeze().astype(np.uint8)
# If multiple masks exist, use the first one
if mask_raw.ndim > 2:
mask_raw = mask_raw[0]
# Binary mask for visualization (0 or 255)
mask_img = mask_raw * 255
# -------------------------------------------------
# 6. Apply mask to RGB image
# -------------------------------------------------
rgb_img = cv2.cvtColor(color_img, cv2.COLOR_BGR2RGB)
masked_rgb_img = masked_img(rgb_img, mask_img)
# -------------------------------------------------
# 7. Apply mask to depth image
# -------------------------------------------------
masked_depth_img = depth_img * mask_raw
timestamp = f"{self.cam.frame_count:06d}"
# You do not need to save these images. These commands are only used to help you check the results at each step.
cv2.imwrite(f"/workspace/masked_img/{timestamp}.png", dst)
cv2.imwrite(f"/workspace/masks/{timestamp}.png", mask_img)
The goal of this step is to generate multi-view images.
bg_color = np.array([1., 1., 1.], dtype=np.float32)
cond, _, _ = load_image(
f"/workspace/masked_img/{timestamp}.png",
bg_color,
masked_depth_img
)
cond = Image.fromarray((cond * 255).astype(np.uint8)[:, :, :3])
cond = remove(cond)
cond.save(f"/workspace/croped_imgs/{timestamp}.png")
cv2.imwrite(f"/workspace/depth_img/{timestamp}.png", depth_img * 255)
cv2.imwrite(f"/workspace/masked_depth_img/{timestamp}.png", masked_depth_img * 255)
# Generate multi-view images
images = pipeline(cond, num_inference_steps=20,
output_type='pt', guidance_scale=1.0).images
result = make_grid(images, nrow=6, ncol=2, padding=0, value_range=(0, 1))
save_image(result, f"/workspace/wonder3d_results/{timestamp}.png")
# Take prediction half
bsz = images.shape[0] // 2
images_pred = images[bsz:]
# Create directories
mast3r_dir = f"/workspace/mast3r_inputs/{timestamp}"
mask_dir = f"/workspace/mask/{timestamp}"
os.makedirs(mast3r_dir, exist_ok=True)
os.makedirs(mask_dir, exist_ok=True)
# Save + remove background directly
for i in range(6):
img_np = (images_pred[i].permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8)
img_np = remove(img_np)
cv2.imwrite(f"{mast3r_dir}/{i}.png", img_np)To run the following command
images = pipeline(
cond,
num_inference_steps=20,
output_type='pt',
guidance_scale=1.0
).imagesYou may follow the Usage of Wonder3D.
Now you have six views of the object. The next step is to generate a 3D model from them using MASt3R.
lr1 = 0.07
lr2 = 0.014
niter1 = 300
images = load_images(image_list, size=512)
pairs = make_pairs(images, prefilter=None, symmetrize=True)
scene = sparse_global_alignment(image_list, pairs, cache_dir,
model, lr1=lr1, niter1=niter1, lr2=lr2, niter2=niter2, device=device,
opt_depth='depth' in optim_level, shared_intrinsics=shared_intrinsics,
matching_conf_thr=matching_conf_thr, depth_range=(filtered_mean)/1e3)
gradio_delete_cache=True
outdir = os.path.join(os.curdir, 'output')
os.makedirs(outdir, exist_ok=True)
outfile_name = tempfile.mktemp(suffix='_scene.glb', dir=outdir)
scene_state = SparseGAState(scene, gradio_delete_cache, cache_dir, outfile_name)
silent=False
min_conf_thr = 2.0
as_pointcloud = False
mask_sky = False
clean_depth = True
transparent_cams = True
cam_size = 0
TSDF_thresh = 0
obj_path = f"/workspace/dust3r_outputs/{timestamp}.obj"
obj_pcd_path = f"/workspace/dust3r_outputs_pcd/{timestamp}.pcd"
outfile = get_3D_model_from_scene(silent, scene_state, min_conf_thr, as_pointcloud, mask_sky,
clean_depth, transparent_cams, cam_size, TSDF_thresh,obj_path,obj_pcd_path)
scene_my = scene_state.sparse_ga
rgbimg = to_numpy(scene_my.imgs)
focals = scene_my.get_focals()
poses = scene_my.get_im_poses()We modified the get_3D_model_from_scene to save the 3D model.
Also, remember to modify the mask generation logic in MASt3R. Otherwise, the background may be recognized as part of the object.
Again, it is recommended to first follow the MASt3R demo to create a script that generates a 3D model from multiple images, and then gradually add these modifications to improve the results.
The generate 3D model will be a 3D point cloud. Remember to apply the SOR filtering to denoise it.
import open3d as o3d
pcd = o3d.io.read_point_cloud("input.ply")
#SOR filtering. Just an example. The hyperparameters do not indicate recommended values.
pcd_filtered, ind = pcd.remove_statistical_outlier(
nb_neighbors=200, # KNN
std_ratio=2.0 # alpha
)
o3d.io.write_point_cloud("filtered.ply", pcd_filtered)Step 2 and Step 3 can be replaced by a one-stop solution: InstantMesh. Following this demo, you can obtain a 3D model directly and instantly. A potential issue is that Zero123++ only generates multi-view images with fixed observing poses, which may pose a challenge for subsequent scale recovery. Despite this limitation, we still highly recommend using InstantMesh, as our framework involves substantial engineering effort, whereas InstantMesh allows you to jump directly to Steps 4 and 5.
Since the generated 3D model does not have a meaningful physical scale, we need to recover the scale using real depth measurements.

from submodules.FoundationPose.Utils import depth2xyzmap
masked_depth_pc = depth2xyzmap(masked_depth_img,intrinsic_matrix)/1e3
masked_depth_pc = masked_depth_pc.reshape(-1, 3)
masked_depth_pc = masked_depth_pc[np.any(masked_depth_pc != [0, 0, 0], axis=1)]
pp = o3d.geometry.PointCloud()
pp.points = o3d.utility.Vector3dVector(masked_depth_pc)
o3d.io.write_point_cloud(f"/workspace/gt.pcd",pp)The measured point cloud of the object is noisy. We need to apply SOR filtering to denoise it as well.
Now that we have the denoised measured and estimated point clouds, we can compute a scale factor and use it to recover the physical size of the reference model.
import open3d as o3d
import numpy as np
def similarity_umeyama(A, B):
mu_A, mu_B = A.mean(0), B.mean(0)
AA, BB = A - mu_A, B - mu_B
U, S, Vt = np.linalg.svd(BB.T @ AA / len(A))
R = Vt.T @ U.T
if np.linalg.det(R) < 0:
Vt[-1] *= -1
R = Vt.T @ U.T
scale = S.sum() / np.var(BB, axis=0).sum()
t = mu_A - scale * R @ mu_B
T = np.eye(4)
T[:3,:3] = scale * R
T[:3,3] = t
return scale, T
source = o3d.io.read_point_cloud("estimated.ply")
target = o3d.io.read_point_cloud("measured.ply")
A = np.asarray(target.points)
B = np.asarray(source.points)
scale, T = similarity_umeyama(A, B)
print("scale:", scale)
source.transform(T)
reg = o3d.pipelines.registration.registration_icp(
source, target, 0.02, np.eye(4),
o3d.pipelines.registration.TransformationEstimationPointToPlane()
)
source.transform(reg.transformation)
o3d.visualization.draw_geometries([source, target])Then you may follow FoundationPose Demo to track the 6D pose of the object. The idea is quite straightforward. When using the vanilla FP, you need to define the reference model first. Simply locate where the reference model is defined and replace it with the result from the above steps.
anchor_mesh = trimesh.load(obj_path)
anchor_mesh = anchor_mesh.apply_scale(scale)
self.pose_estimator.mesh = anchor_mesh
self.pose_estimator.estimator.reset_object(
model_pts = anchor_mesh.vertices,
model_normals = anchor_mesh.vertex_normals,mesh=anchor_mesh)
self.pose_estimator.tracked = False
obj_pose6d, extents = self.pose_estimator.inference(color_img, depth_img,
mask_img, intrinsic_matrix)Once you complete Steps 1–5, you will have a framework capable of estimating the 6D pose from the first glance at the object. If you wish to update the mesh based on new observations, the following commands specify when the mesh should be updated.

def sample_views_icosphere(n_views, subdivisions=None, radius=1):
if subdivisions is not None:
mesh = trimesh.creation.icosphere(subdivisions=subdivisions, radius=radius)
else:
subdivision = 1
while 1:
mesh = trimesh.creation.icosphere(subdivisions=subdivision, radius=radius)
if mesh.vertices.shape[0]>=n_views:
break
subdivision += 1
cam_in_obs = np.tile(np.eye(4)[None], (len(mesh.vertices),1,1))
cam_in_obs[:,:3,3] = mesh.vertices
up = np.array([0,0,1])
z_axis = -cam_in_obs[:,:3,3] #(N,3)
z_axis /= np.linalg.norm(z_axis, axis=-1).reshape(-1,1)
x_axis = np.cross(up.reshape(1,3), z_axis)
invalid = (x_axis==0).all(axis=-1)
x_axis[invalid] = [1,0,0]
x_axis /= np.linalg.norm(x_axis, axis=-1).reshape(-1,1)
y_axis = np.cross(z_axis, x_axis)
y_axis /= np.linalg.norm(y_axis, axis=-1).reshape(-1,1)
cam_in_obs[:,:3,0] = x_axis
cam_in_obs[:,:3,1] = y_axis
cam_in_obs[:,:3,2] = z_axis
return cam_in_obs
def cartesian_to_spherical(cartesian_point):
x, y, z = cartesian_point
r = np.sqrt(x**2 + y**2 + z**2)
theta = np.arccos(z / r) # Polar angle
phi = np.arctan2(y, x) # Azimuthal angle
return theta, phi
def spherical_angle_difference(point1, point2):
# Convert Cartesian coordinates to spherical angles
theta1, phi1 = cartesian_to_spherical(point1)
theta2, phi2 = cartesian_to_spherical(point2)
# Calculate the angle difference using simple addition of deltas
delta_phi = abs(phi2 - phi1)
delta_theta = abs(theta2 - theta1)
# Sum of the angular differences
angle_distance = delta_phi + delta_theta
return angle_distance
def check_keyframe_icosphere(query_cam, occupied, tree, key_frame_points, angle_threshold=0.1):
'''
@query_cam: (4,4) np array
@occupied: dict
@tree: KDTree
@angle_threshold: float (in radians)
'''
query_cam = query_cam[:3, 3]
dist, idx = tree.query(query_cam.reshape(1, -1), k=1)
closest_point = key_frame_points[idx][0]
# Compute the angular distance
angle_distance = spherical_angle_difference(query_cam, closest_point)
if angle_distance < angle_threshold:
closest_point_tuple = tuple(closest_point)
if closest_point_tuple in occupied:
return False
else:
occupied[closest_point_tuple] = True
return True
else:
print(f"Angle distance: {angle_distance} exceeds threshold: {angle_threshold}. Ignored.")
return False
def generate_keyframe_icosphere(n_views, subdivisions=None, radius=1):
from scipy.spatial import KDTree
key_frame_points = sample_views_icosphere(n_views, subdivisions=subdivisions, radius=radius)[:,:3,3]
key_frame_pose = sample_views_icosphere(n_views, subdivisions=subdivisions, radius=radius)
occupied = {}
tree = KDTree(key_frame_points)
return tree, occupied, key_frame_points, key_frame_poseMesh updating can be formulated as an object-level SLAM problem. The key distinction is that, through HIPPo Dreamer, we have access to a 3D prior of the object model.
In our current implementation, we use a colored point cloud representation, and the update logic is based on KD-Tree nearest-neighbor search followed by point replacement.
def transfer_color_and_position(source_pcd, target_pcd):
source_points = np.asarray(source_pcd.points)
target_points = np.asarray(target_pcd.points)
target_colors = np.asarray(target_pcd.colors)
# Create a KDTree for the source point cloud
source_kdtree = o3d.geometry.KDTreeFlann(source_pcd)
# Iterate over each point in the target point cloud
for i, target_point in enumerate(target_points):
[_, idx, _] = source_kdtree.search_knn_vector_3d(target_point, 1) # 1 nearest neighbor
nearest_point_idx = idx[0]
# source_pcd.points[nearest_point_idx] =target_point # if geo is also wanted
source_pcd.colors[nearest_point_idx] =target_colors[i]
return source_pcdThis module can be replaced with more advanced alternatives, such as VGGT SLAM 2.0.
We would like to thank the authors for releasing the following projects:
FoundationPose
MASt3R
Wonder3D
InstantMesh
Parts of this project page were adopted from the Nerfies page.