Making a panorama from scratch

Nowadays our phone cameras are well calibrated, with the most advanced feature detection and matching techniques built in, so we can make panoramas without needing to think. But thinking is fun, so why not take a walk back in time, to when panorama making was a multi-step puzzle involving SIFT, RANSAC & projective transformations.
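The projective transformation (or homography) is the glue here: a 3×3 matrix that maps pixel coordinates in one image plane to pixel coordinates in another, working in homogeneous coordinates. As a minimal sketch of how one is applied to a point (the matrix values below are invented purely for illustration):

import numpy as np

# An illustrative homography; the values are made up, not from real data
H = np.array(
    [
        [1.0, 0.02, 30.0],
        [0.01, 1.0, -5.0],
        [1e-4, 0.0, 1.0],
    ]
)

point = np.array([100.0, 200.0, 1.0])  # (x, y, 1): homogeneous coordinates
projected = H @ point
x, y = projected[:2] / projected[2]  # divide by w to get back to pixels

This is exactly what cv.perspectiveTransform does for arrays of points, and cv.warpPerspective does for whole images; everything below is about estimating H rather than applying it.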

from pathlib import Path

import cv2 as cv
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

%matplotlib inline

# sorted() keeps the frames in filename order, which the pairwise matching
# below relies on; str() keeps older OpenCV builds happy
images = [cv.imread(str(path)) for path in sorted(Path("./images").glob("*.png"))]


def show_image(img: np.ndarray) -> None:
    img_rgb = cv.cvtColor(img, cv.COLOR_BGR2RGB)

    # Display the image using Matplotlib
    plt.imshow(img_rgb)
    plt.axis("off")  # Hide axes
    plt.show()


def find_im_corners(frame_shape: tuple[int, int, int], H: np.ndarray) -> np.ndarray:
    # Project an image's corners through H to find its footprint in the panorama
    h1, w1, _ = frame_shape
    original_size = h1 * w1

    im1_corners = np.float32([[0, 0], [0, h1], [w1, h1], [w1, 0]]).reshape(-1, 1, 2)
    corner_locations = cv.perspectiveTransform(im1_corners, H)
    (xmin, ymin) = np.int32(np.floor(corner_locations.min(axis=0).ravel()))
    (xmax, ymax) = np.int32(np.ceil(corner_locations.max(axis=0).ravel()))
    im_size = (xmax - xmin) * (ymax - ymin)
    # A bad homography can fling the corners out to a huge bounding box; if the
    # projected area is more than 10x the original, keep the untransformed corners
    if abs(im_size) < original_size * 10:
        return corner_locations
    return im1_corners


for image in images:
    show_image(image)

(output: the four input photographs)

Calculate keypoints and descriptors with SIFT (Scale-Invariant Feature Transform)

frame_bw = [cv.cvtColor(im, cv.COLOR_BGR2GRAY) for im in images]
sift = cv.SIFT_create()

# sift.detectAndCompute returns a tuple of keypoints & descriptors
sift_features = [sift.detectAndCompute(im, None) for im in tqdm(frame_bw)]
100%|██████████| 4/4 [00:08<00:00,  2.05s/it]
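Before matching, it's worth eyeballing what SIFT actually found. OpenCV's cv.drawKeypoints renders each keypoint's position, scale and orientation onto the image; a quick sanity-check sketch, reusing the helpers above:

keypoints0, _ = sift_features[0]
# Rich flags draw each keypoint as a circle (scale) with a radius line (orientation)
keypoint_vis = cv.drawKeypoints(
    images[0], keypoints0, None, flags=cv.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS
)
show_image(keypoint_vis)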

Match Features Between Frames using FLANN (Fast Library for Approximate Nearest Neighbors)

FLANN_INDEX_KDTREE = 1
index_params = {"algorithm": FLANN_INDEX_KDTREE, "trees": 5}
search_params = {"checks": 50}

threshold = 0.55  # Lowe's ratio test: the best match must beat the second-best by this factor

flann = cv.FlannBasedMatcher(index_params, search_params)


def match_features(
    descriptor1: np.ndarray, descriptor2: np.ndarray
) -> list[list[cv.DMatch]]:
    results = flann.knnMatch(descriptor1, descriptor2, k=2)
    distance_mat = np.array([[n1.distance, n2.distance] for n1, n2 in results])
    # Lowe's ratio test: keep a match only when its nearest neighbour is
    # significantly closer than the second-nearest
    match_mat = distance_mat[:, 0] < threshold * distance_mat[:, 1]
    return [[result[0]] for i, result in enumerate(results) if match_mat[i]]


matches = [
    match_features(sift_features[i][1], sift_features[i + 1][1])
    for i in tqdm(range(len(images) - 1))
]

100%|██████████| 3/3 [00:10<00:00,  3.53s/it]

img3 = cv.drawMatchesKnn(
    images[0],
    sift_features[0][0],
    images[1],
    sift_features[1][0],
    matches[0],
    None,
    flags=cv.DrawMatchesFlags_NOT_DRAW_SINGLE_POINTS,
)

show_image(img3)

(output: the ratio-test matches drawn between the first two frames)

Find the homography that projects one image onto the other using RANSAC (RANdom SAmple Consensus)

homography_set = []

for i in tqdm(range(len(matches))):
    key_points1 = sift_features[i][0]
    key_points2 = sift_features[i + 1][0]

    src_points = np.float32(
        [key_points1[m[0].queryIdx].pt for m in matches[i]]
    ).reshape(-1, 1, 2)
    dst_points = np.float32(
        [key_points2[m[0].trainIdx].pt for m in matches[i]]
    ).reshape(-1, 1, 2)

    homography, _ = cv.findHomography(src_points, dst_points, cv.RANSAC, 5.0)
    homography_set.append({"H": homography, "idx": i})
100%|██████████| 3/3 [00:00<00:00, 85.79it/s]
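A side note: cv.findHomography also returns a mask marking which matches RANSAC accepted as inliers, which the loop above discards with _. Keeping it gives a cheap health check on each homography; a sketch of the substitution:

# Inside the loop above, keep RANSAC's verdict instead of discarding it:
homography, inlier_mask = cv.findHomography(src_points, dst_points, cv.RANSAC, 5.0)
# A low inlier ratio usually signals a poor or degenerate homography
print(f"pair {i}: {int(inlier_mask.sum())}/{len(inlier_mask)} matches kept as inliers")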

This part is just to make the final panorama nicer

Right now each homography stitches one image to the next. We could chain them one after another to make a panorama, but that would give a weird perspective, with every frame projected onto the plane of the last image. Instead, we can compose the homographies so that everything projects onto a middle image (the third, in this case), which looks much nicer.

# I will project all to the POV of picture 3

# Composition is matrix multiplication, applied right to left:
# first image 1 -> image 2, then image 2 -> image 3
H_13 = homography_set[1]["H"] @ homography_set[0]["H"]
H_23 = homography_set[1]["H"]
H_33 = np.eye(3)
H_43 = np.linalg.inv(homography_set[2]["H"])  # The inverse of H_34 is H_43

homography_set = [
    {"H": H_13, "idx": 0},
    {"H": H_23, "idx": 1},
    {"H": H_33, "idx": 2},
    {"H": H_43, "idx": 3},
]
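With four images the matrices are easy enough to write by hand, but the same composition generalises. Here is a sketch of a helper (my own, not part of the original pipeline) that maps every frame onto an arbitrary reference frame, assuming a list where pairwise[i] projects frame i onto frame i + 1:

def homographies_to_reference(
    pairwise: list[np.ndarray], ref: int
) -> list[np.ndarray]:
    out = []
    for i in range(len(pairwise) + 1):
        H = np.eye(3)
        if i < ref:
            for j in range(i, ref):  # chain forwards: i -> i+1 -> ... -> ref
                H = pairwise[j] @ H
        else:
            for j in range(ref, i):  # chain backwards by inverting each step
                H = H @ np.linalg.inv(pairwise[j])
        out.append(H)
    return out

# Called with the pairwise homographies from the RANSAC loop (before the
# reassignment above), ref=2 reproduces H_13, H_23, H_33 and H_43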

More stuff to make the final result look nicer

By default we would be projecting the other images onto image 3, but the output canvas would still be the size of image 3 unless we do something about it. To see the whole panorama, we can find where the corners of each image are projected, take the minimum and maximum over all of them, and then build a translation matrix that shifts every homography into this new, larger canvas.

all_corners = [
    find_im_corners(images[item["idx"]].shape, item["H"]) for item in homography_set
]
all_points = np.concatenate(all_corners, axis=0)
(xmin, ymin) = np.int32(np.floor(all_points.min(axis=0).ravel()))
(xmax, ymax) = np.int32(np.ceil(all_points.max(axis=0).ravel()))

# Shift everything so the panorama's top-left corner lands at (0, 0)
translation = np.array([[1, 0, -xmin], [0, 1, -ymin], [0, 0, 1]])

visual_homographies = [
    {"idx": item["idx"], "H": translation @ item["H"]} for item in homography_set
]

Now we can project our images!

centered_frames = [
    cv.warpPerspective(images[item["idx"]], item["H"], (xmax - xmin, ymax - ymin))
    for item in visual_homographies
]

for img in centered_frames:
    show_image(img)

(output: the four frames warped onto the shared panorama canvas)

Lastly, we need to combine our frames and blend them to make the result more visually appealing

We can do this by overlaying all of our images on each other, since they now share the same canvas size, and taking the average pixel intensity wherever pixels overlap. (There are other ways of blending, but this is the easiest.)

def blend_all_images(image_list: list[np.ndarray]) -> np.ndarray:
    # Binary mask of each frame's non-black pixels, eroded slightly so the
    # ragged warped edges don't bleed into the blend
    erosion_kernel = np.ones((5, 5), np.uint8)
    image_masks = [
        cv.erode(
            cv.threshold(cv.cvtColor(image, cv.COLOR_BGR2GRAY), 1, 1, cv.THRESH_BINARY)[
                1
            ],
            erosion_kernel,
        )
        for image in image_list
    ]
    # How many frames cover each pixel; overlaps get averaged by this count
    weighting_mask = np.sum(image_masks, axis=0)
    weighting_mask[weighting_mask == 0] = 1  # Avoid zero divide

    blended_image = np.zeros_like(image_list[0], dtype=np.float32)

    for i, mask in tqdm(enumerate(image_masks)):
        weighted_mask = mask / weighting_mask
        mask_3_channel = np.stack([weighted_mask, weighted_mask, weighted_mask], axis=2)
        blended_image += mask_3_channel * image_list[i].astype(np.float32)

    return np.clip(blended_image, 0, 255).astype(np.uint8)


blended_panorama = blend_all_images(centered_frames)
show_image(blended_panorama)
4it [00:08,  2.15s/it]

(output: the blended panorama, averaged where frames overlap)

Another simple approach is to just pick the first available pixel wherever frames overlap

def blend_first_pixel(image_list: list[np.ndarray]) -> np.ndarray:
    blended_image = np.zeros_like(image_list[0])
    mask = np.zeros(image_list[0].shape[:2], dtype=bool)

    for img in image_list:
        img_mask = np.any(img != 0, axis=2)
        update_mask = img_mask & (~mask)
        for c in range(3):  # for each channel
            blended_image[..., c][update_mask] = img[..., c][update_mask]
        mask |= update_mask  # update mask to mark filled pixels

    return blended_image

selective_panorama = blend_first_pixel(centered_frames)
show_image(selective_panorama)

(output: the panorama blended by taking the first non-empty pixel)
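And as mentioned, fancier blends exist. One common option is feathering: weight each frame by the distance to its own border, so contributions taper off smoothly instead of switching at a hard seam. A sketch using cv.distanceTransform (again my own illustrative variant, not from the original notebook):

def blend_feathered(image_list: list[np.ndarray]) -> np.ndarray:
    # Each pixel's weight is its distance to the frame border, so a frame's
    # contribution fades to zero at its edges
    weights = []
    for img in image_list:
        mask = (cv.cvtColor(img, cv.COLOR_BGR2GRAY) > 0).astype(np.uint8)
        weights.append(cv.distanceTransform(mask, cv.DIST_L2, 5))

    total = np.sum(weights, axis=0)
    total[total == 0] = 1  # Avoid zero divide

    blended = np.zeros_like(image_list[0], dtype=np.float32)
    for img, weight in zip(image_list, weights):
        blended += (weight / total)[..., None] * img.astype(np.float32)
    return np.clip(blended, 0, 255).astype(np.uint8)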
