Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ build/
*.so
.eggs/
demo_render/
captures/
CLAUDE.md
.claude/
.agents/
skyseg.onnx
skyseg.onnx
12 changes: 12 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,18 @@ python demo.py --model_path /path/to/checkpoint.pt \
--video_path video.mp4 --fps 10
```

### Webcam Capture

The webcam mode records a short sequence from the local camera, saves the
frames under `captures/`, then runs the normal image-sequence reconstruction.
Move the camera slowly sideways or around the object so the model gets
parallax; rotating in place gives much weaker geometry.

```bash
python demo.py --model_path /path/to/checkpoint.pt \
--webcam --capture_frames 36 --capture_fps 3
```

### Streaming with Keyframe Interval

Use `--keyframe_interval` to reduce KV cache memory by only keeping every N-th frame as a keyframe. Non-keyframe frames still produce predictions but are not stored in the cache. This is useful for long sequences which exceed 320 frames (We train with video RoPE on 320 views, so performance degrades when the KV cache stores more than 320 views. Using a keyframe strategy allows inference over longer sequences.).
Expand Down
149 changes: 147 additions & 2 deletions demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import glob
import os
import time
from datetime import datetime

# Must be set before `import torch` / any CUDA init. Reduces the reserved-vs-allocated
# memory gap by letting the caching allocator grow segments on demand instead of
Expand All @@ -42,6 +43,112 @@
# Image loading
# =============================================================================

def capture_webcam_frames(
camera_index=0,
capture_dir=None,
num_frames=48,
fps=4,
width=1280,
height=720,
warmup_frames=12,
preview=True,
mirror_preview=True,
):
"""Capture a finite webcam sequence to an image folder."""
if capture_dir is None:
stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
capture_dir = os.path.join("captures", f"webcam_{stamp}")
os.makedirs(capture_dir, exist_ok=True)

backends = []
if hasattr(cv2, "CAP_AVFOUNDATION"):
backends.append(cv2.CAP_AVFOUNDATION)
backends.append(None)

cap = None
for backend in backends:
candidate = (
cv2.VideoCapture(camera_index, backend)
if backend is not None
else cv2.VideoCapture(camera_index)
)
if candidate.isOpened():
cap = candidate
break
candidate.release()

if cap is None:
raise RuntimeError(
f"Could not open webcam index {camera_index}. On macOS, allow camera "
"access for the terminal or app that launches this script."
)

cap.set(cv2.CAP_PROP_FRAME_WIDTH, width)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, height)
cap.set(cv2.CAP_PROP_FPS, fps)

for _ in range(max(0, warmup_frames)):
cap.read()

print(
f"Capturing {num_frames} webcam frames at ~{fps} FPS "
f"to {capture_dir}. Move the camera slowly for parallax."
)
if preview:
print("Press q or Esc in the preview window to stop early.")

saved_paths = []
next_capture_time = time.monotonic()
capture_interval = 1.0 / max(fps, 0.1)

try:
while len(saved_paths) < num_frames:
ok, frame = cap.read()
if not ok:
time.sleep(0.02)
continue

now = time.monotonic()
if now >= next_capture_time:
out_path = os.path.join(capture_dir, f"{len(saved_paths):06d}.jpg")
cv2.imwrite(out_path, frame)
saved_paths.append(out_path)
next_capture_time = now + capture_interval

if preview:
preview_frame = cv2.flip(frame, 1) if mirror_preview else frame.copy()
status = f"{len(saved_paths)}/{num_frames} frames"
cv2.putText(
preview_frame,
status,
(20, 40),
cv2.FONT_HERSHEY_SIMPLEX,
1.0,
(0, 255, 0),
2,
cv2.LINE_AA,
)
cv2.imshow("LingBot-Map webcam capture", preview_frame)
key = cv2.waitKey(1) & 0xFF
if key in (ord("q"), 27):
break
else:
time.sleep(0.005)
finally:
cap.release()
if preview:
cv2.destroyWindow("LingBot-Map webcam capture")

if len(saved_paths) < 2:
raise RuntimeError(
f"Only captured {len(saved_paths)} frame(s); at least 2 are required "
"for reconstruction."
)

print(f"Captured {len(saved_paths)} frames.")
return capture_dir


def load_images(image_folder=None, video_path=None, fps=10, image_ext=".jpg,.png",
first_k=None, stride=1, image_size=518, patch_size=14, num_workers=8):
"""Load images from folder or video and preprocess into a tensor.
Expand Down Expand Up @@ -237,6 +344,26 @@ def main():
# Input
parser.add_argument("--image_folder", type=str, default=None)
parser.add_argument("--video_path", type=str, default=None)
parser.add_argument("--webcam", action="store_true",
help="Capture a short sequence from the local webcam, then reconstruct it")
parser.add_argument("--camera_index", type=int, default=0,
help="Webcam index for --webcam. The built-in camera is usually 0.")
parser.add_argument("--capture_dir", type=str, default=None,
help="Folder for saved webcam frames (default: captures/webcam_<timestamp>/)")
parser.add_argument("--capture_frames", type=int, default=48,
help="Number of webcam frames to save before reconstruction")
parser.add_argument("--capture_fps", type=float, default=4.0,
help="Sampling FPS for webcam capture")
parser.add_argument("--camera_width", type=int, default=1280,
help="Requested webcam capture width")
parser.add_argument("--camera_height", type=int, default=720,
help="Requested webcam capture height")
parser.add_argument("--warmup_frames", type=int, default=12,
help="Webcam frames to discard before saving")
parser.add_argument("--preview", action=argparse.BooleanOptionalAction, default=True,
help="Show a local webcam preview while capturing")
parser.add_argument("--mirror_preview", action=argparse.BooleanOptionalAction, default=True,
help="Mirror the preview only; saved frames stay unmirrored")
parser.add_argument("--fps", type=int, default=10)
parser.add_argument("--first_k", type=int, default=None)
parser.add_argument("--stride", type=int, default=1)
Expand Down Expand Up @@ -292,11 +419,29 @@ def main():
help="Export stride-sampled, resized/cropped images to this folder")

args = parser.parse_args()
assert args.image_folder or args.video_path, \
"Provide --image_folder or --video_path"
input_count = sum([
args.image_folder is not None,
args.video_path is not None,
args.webcam,
])
assert input_count == 1, \
"Provide exactly one input: --image_folder, --video_path, or --webcam"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if args.webcam:
args.image_folder = capture_webcam_frames(
camera_index=args.camera_index,
capture_dir=args.capture_dir,
num_frames=args.capture_frames,
fps=args.capture_fps,
width=args.camera_width,
height=args.camera_height,
warmup_frames=args.warmup_frames,
preview=args.preview,
mirror_preview=args.mirror_preview,
)

# ── Load images & model ──────────────────────────────────────────────────
t0 = time.time()
images, paths, resolved_image_folder = load_images(
Expand Down