MiDaS
MiDaS models for computing relative depth from a single image.
MiDaS computes relative inverse depth from a single image. The repository provides multiple models that cover different use cases, ranging from a small, high-speed model to a very large model that provides the highest accuracy. The models have been trained on 10 distinct datasets using multi-objective optimization to ensure high quality on a wide range of inputs.
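Outside of dora, MiDaS can be exercised with a few lines of PyTorch. The following is a minimal sketch based on the torch.hub interface, assuming an internet connection and an arbitrary test image (input.jpg is a placeholder path); it mirrors the steps the operator below performs.

import cv2
import torch

# Minimal sketch: load MiDaS_small via torch.hub and predict relative inverse depth.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = torch.hub.load("intel-isl/MiDaS", "MiDaS_small")
model.to(device).eval()

transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
transform = transforms.small_transform

# input.jpg is a placeholder; use any test image.
img = cv2.cvtColor(cv2.imread("input.jpg"), cv2.COLOR_BGR2RGB)

with torch.no_grad():
    prediction = model(transform(img).to(device))
    # Resize the prediction back to the input resolution.
    prediction = torch.nn.functional.interpolate(
        prediction.unsqueeze(1),
        size=img.shape[:2],
        mode="bicubic",
        align_corners=False,
    ).squeeze()

depth = prediction.cpu().numpy()  # relative inverse depth, unnormalized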
Installation:
To install MiDaS offline:
cd $DORA_DEP_HOME/dependencies/
git clone git@github.com:isl-org/MiDaS.git
cd MiDaS/weights
# Manual download of the weights is optional: if they are missing, the operator will download them automatically at runtime
wget https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_small_256.pt
cp midas_v21_small_256.pt $HOME/.cache/torch/hub/checkpoints/
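As a quick sanity check that the checkpoint is in place for offline use (a minimal sketch; the filename and cache path follow the steps above):

import os
import torch

# Sketch: confirm the copied checkpoint is readable before launching the dataflow.
weight_path = os.path.expanduser(
    "~/.cache/torch/hub/checkpoints/midas_v21_small_256.pt"
)
assert os.path.exists(weight_path), f"missing checkpoint: {weight_path}"
state_dict = torch.load(weight_path, map_location="cpu")
print(f"loaded checkpoint with {len(state_dict)} entries")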
Inputs
- image: HEIGHT x WIDTH x BGRA array.
Outputs
- depth_frame: HEIGHT x WIDTH x BGRA array containing the colorized relative depth.
Example output

Add the following dataflow configuration:
  - id: midas_op
    operator:
      outputs:
        - depth_frame
      inputs:
        image: webcam/image
      python: ../../operators/midas_op.py
    env:
      PYTORCH_DEVICE: "cuda"
      MIDAS_PATH: $DORA_DEP_HOME/dependencies/MiDaS/
      MIDAS_WEIGHT_PATH: $DORA_DEP_HOME/dependencies/MiDaS/weights/midas_v21_small_256.pt
      MODEL_TYPE: "MiDaS_small"
      MODEL_NAME: "MiDaS_small"
- model_type = "DPT_Large" # MiDaS v3 - Large (highest accuracy, slowest inference speed)
- model_type = "DPT_Hybrid" # MiDaS v3 - Hybrid (medium accuracy, medium inference speed)
- model_type = "MiDaS_small" # MiDaS v2.1 - Small (lowest accuracy, highest inference speed)
Methods
__init__()
Source Code
    def __init__(self):
        if MIDAS_PATH is None:
            # With internet
            self.model = torch.hub.load(
                "intel-isl/MiDaS",
                MODEL_TYPE,
            )
            midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
        else:
            # Without internet
            self.model = torch.hub.load(
                repo_or_dir=MIDAS_PATH,
                model=MODEL_NAME,
                weights=MIDAS_WEIGHT_PATH,
                source="local",
            )
            midas_transforms = torch.hub.load(
                repo_or_dir=MIDAS_PATH, model="transforms", source="local"
            )
        if MODEL_TYPE == "DPT_Large" or MODEL_TYPE == "DPT_Hybrid":
            self.transform = midas_transforms.dpt_transform
        else:
            self.transform = midas_transforms.small_transform
        self.model.to(torch.device(DEVICE))
        self.model.eval()
.on_event(...)
Source Code
    def on_event(
        self,
        dora_event: dict,
        send_output: Callable[[str, bytes], None],
    ) -> DoraStatus:
        if dora_event["type"] == "INPUT":
            return self.on_input(dora_event, send_output)
        return DoraStatus.CONTINUE
.on_input(...)
Handle an image input.
Args:
    dora_input["id"] (str): Id of the input declared in the yaml configuration
    dora_input["data"] (bytes): Bytes message of the input
    send_output (Callable[[str, bytes], None]): Function enabling sending output back to dora.
Source Code
    def on_input(
        self,
        dora_input: dict,
        send_output: Callable[[str, bytes], None],
    ) -> DoraStatus:
        """Handle image
        Args:
            dora_input["id"]  (str): Id of the input declared in the yaml configuration
            dora_input["data"] (bytes): Bytes message of the input
            send_output (Callable[[str, bytes], None]): Function enabling sending output back to dora.
        """
        if dora_input["id"] == "image":
            # Convert bytes to numpy array
            frame = np.frombuffer(
                dora_input["data"],
                np.uint8,
            ).reshape((IMAGE_HEIGHT, IMAGE_WIDTH, 4))
            with torch.no_grad():
                image = frame[:, :, :3]
                img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                input_batch = self.transform(img).to(DEVICE)
                prediction = self.model(input_batch)
                prediction = torch.nn.functional.interpolate(
                    prediction.unsqueeze(1),
                    size=img.shape[:2],
                    mode="bicubic",
                    align_corners=False,
                ).squeeze()
                depth_output = prediction.cpu().numpy()
                depth_min = depth_output.min()
                depth_max = depth_output.max()
                normalized_depth = (
                    255 * (depth_output - depth_min) / (depth_max - depth_min)
                )
                normalized_depth *= 3
                depth_frame = (
                    np.repeat(np.expand_dims(normalized_depth, 2), 3, axis=2) / 3
                )
                depth_frame = cv2.applyColorMap(
                    np.uint8(depth_frame), cv2.COLORMAP_INFERNO
                )
                height, width = depth_frame.shape[:2]
                depth_frame_4 = np.dstack(
                    [depth_frame, np.ones((height, width), dtype="uint8") * 255]
                )
                send_output(
                    "depth_frame",
                    depth_frame_4.tobytes(),
                    dora_input["metadata"],
                )
        return DoraStatus.CONTINUE
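For reference, a downstream operator can rebuild the colorized depth image from the bytes sent on depth_frame, assuming it knows the frame resolution. This is a hedged sketch; IMAGE_HEIGHT and IMAGE_WIDTH stand in for the webcam resolution, and dora_input is the event received by that downstream operator.

import numpy as np

# Sketch: reconstruct the BGRA depth frame emitted by the midas operator.
# IMAGE_HEIGHT and IMAGE_WIDTH are placeholders for the actual webcam resolution.
depth_frame = np.frombuffer(dora_input["data"], np.uint8).reshape(
    (IMAGE_HEIGHT, IMAGE_WIDTH, 4)
)
# The first three channels hold the inferno-colormapped relative depth;
# the fourth channel is a constant 255 alpha.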