Pose Traces

Guide and follower annotations are paired with their corresponding pose traces: a sequence of snapshots that capture the annotator’s virtual camera pose and field-of-view. The naming convention for these files is {instruction_id:06}_guide_pose_trace.npz for guide annotations and {demonstration_id:06}_follower_pose_trace.npz for follower annotations.

Data schema:

{'pano': (np.str, [n, 1]),
 'time': (np.float32, [n, 1]),
 'audio_time': (np.float32, [n, 1]),
 'extrinsic_matrix': (np.float32, [n, 16]),
 'intrinsic_matrix': (np.float32, [n, 16]),
 'image_mask': (np.bool, [k, 128, 256]),
 'text_masks': (np.bool, [k, m]),
 'feature_weights': (np.float32, [k, 36])}

Where n is the number of snapshots, k is the number of panoramic viewpoints in the associated path, and m is the number of BERT SubWord tokens in the tokenized instruction.

Field descriptions:

  • pano: Panoramic viewpoint of the snapshot.
  • time: Timestamp of the snapshot in seconds.
  • audio_time: Timestamp corresponding to the follower’s progress listening to the guide’s instruction recording. This is only included in follower pose traces.
  • extrinsic_matrix: The extrinsic parameters, or pose, of the annotator’s camera.
  • intrinsic_matrix: The intrinsic parameters, or projection matrix, of the annotator’s camera.
  • image_mask: Mask indicating the pixels observed in the panorama. This mask is in equirectangular format, with heading angle 0 being the center of the image.
  • text_masks: Mask indicating the utterances that have been spoken or heard by the guide or follower, respectively, at this panoramic viewpoint.
  • feature_weights: An image_mask in feature space, corresponding to the typical setting in which 36 image features are generated at 12 heading and 3 elevation increments. Each value is a mean-pooled perspective projection of the image_mask for a particular heading and elevation.

Note that image_mask, text_masks, and feature_weights are provided solely for convenience, as they can be generated from the other pose trace fields and the timed_instruction.

Python program to print pose-trace values:
import numpy as np

# Path to one guide pose trace (see the naming convention above:
# {instruction_id:06}_guide_pose_trace.npz).
npz_path = "./rxr-data/pose_traces/rxr_val_seen/104668_guide_pose_trace.npz"

# NpzFile supports the context-manager protocol, so the underlying file
# handle is closed even if printing raises (the original called close()
# after the loop, which is skipped on an exception).
with np.load(npz_path) as data:
    # Iterate the parallel per-snapshot arrays in lockstep instead of
    # indexing by position.
    for pano, t, extrinsic in zip(data['pano'], data['time'],
                                  data['extrinsic_matrix']):
        print(pano, t, extrinsic)
 

Interpreting the data

  • Dissecting the Camera Matrix, Part 2: The Extrinsic Matrix
  • Dissecting the Camera Matrix, Part 3: The Intrinsic Matrix
  • Wikipedia: Rotation Matrix
  • Stackoverflow: Rotation Matrix to Euler Angles
  • Stackoverflow: Rotation Matrix to Euler Angles (code)

Matterport3D API

Requirements:
  • Width
  • Height
  • Vertical Field Of View
  • Horizontal Field Of View
  • ScanID
  • Panorama ID
  • Heading
  • Elevation
 
import MatterSim
import time
import math
import cv2
import numpy as np

# Render resolution; HFOV is derived from VFOV via the aspect ratio so
# horizontal and vertical angular scales match.
WIDTH = 1366
HEIGHT = 768
VFOV = math.radians(60)
HFOV = VFOV*WIDTH/HEIGHT
TEXT_COLOR = [230, 40, 40]  # BGR colour for the (commented-out) nav labels

cv2.namedWindow('Python RGB')
cv2.namedWindow('Python Depth')

sim = MatterSim.Simulator()
sim.setCameraResolution(WIDTH, HEIGHT)
sim.setCameraVFOV(VFOV)
sim.setDepthEnabled(True) # Turn on depth only after running ./scripts/depth_to_skybox.py (see README.md)
sim.initialize()
#sim.newEpisode(['2t7WUuJeko7'], ['1e6b606b44df4a6086c0f97e826d4d15'], [0], [0])
#sim.newEpisode(['1LXtFkjw3qL'], ['0b22fa63d0f54a529c525afbf2e8bb25'], [0], [0])
#sim.newRandomEpisode(['i5noydFURQK'])
sim.newEpisode(['i5noydFURQK'], ['41f833ed92c0489bb85a911668189278'], [3.103316889455171], [0])

# Pending action for the next makeAction() call; updated by the key
# handling at the bottom of the loop.
heading = 0
elevation = 0
location = 0
ANGLEDELTA = 5 * math.pi / 180  # camera rotate/tilt step: 5 degrees, in radians

print('\nPython Demo')
print('Use arrow keys to move the camera.')
print('Use number keys (not numpad) to move to nearby viewpoints indicated in the RGB view.')
print('Depth outputs are turned off by default - check driver.py:L20 to enable.\n')

while True:
    # BUG FIX: removed leftover debug statements that dumped every simulator
    # member via inspect.getmembers() and printed the object on every frame,
    # throttling the interactive loop and flooding stdout.
    sim.makeAction([location], [heading], [elevation])
    # Reset the pending action so frames with no keypress are no-ops.
    location = 0
    heading = 0
    elevation = 0

    state = sim.getState()[0]
    locations = state.navigableLocations
    rgb = np.array(state.rgb, copy=False)
    #for idx, loc in enumerate(locations[1:]):
    #    # Draw actions on the screen
    #    fontScale = 3.0/loc.rel_distance
    #    x = int(WIDTH/2 + loc.rel_heading/HFOV*WIDTH)
    #    y = int(HEIGHT/2 - loc.rel_elevation/VFOV*HEIGHT)
    #    cv2.putText(rgb, str(idx + 1), (x, y), cv2.FONT_HERSHEY_SIMPLEX, 
    #        fontScale, TEXT_COLOR, thickness=3)
    cv2.imshow('Python RGB', rgb)

    depth = np.array(state.depth, copy=False)
    cv2.imshow('Python Depth', depth)

    k = cv2.waitKey(1)
    if k == -1:  # no key pressed within the 1 ms window
        continue
    k = (k & 255)  # keep only the low byte of the key code
    if k == ord('q'):
        break
    elif ord('1') <= k <= ord('9'):
        # Jump to the numbered navigable viewpoint; out-of-range -> stay put.
        location = k - ord('0')
        if location >= len(locations):
            location = 0
    # 81-84 are arrow-key codes as emitted by OpenCV HighGUI;
    # NOTE(review): these codes are platform-specific — confirm on other OSes.
    elif k == 81 or k == ord('a'):
        heading = -ANGLEDELTA
    elif k == 82 or k == ord('w'):
        elevation = ANGLEDELTA
    elif k == 83 or k == ord('d'):
        heading = ANGLEDELTA
    elif k == 84 or k == ord('s'):
        elevation = -ANGLEDELTA
 

FINAL PROGRAM TO REPLAY DATA

 
import MatterSim
import time
import math
import cv2
import numpy as np
 
# Replay-window render settings.
WIDTH = 1366
HEIGHT = 768
# Vertical field of view in radians (60 degrees).
FOV = math.radians(60)
 
cv2.namedWindow('Python RGB')
 
 
# Single global simulator instance, reused by render() for every snapshot.
sim = MatterSim.Simulator()
sim.setCameraResolution(WIDTH, HEIGHT)
sim.setCameraVFOV(FOV)
sim.initialize()
 
 
def render(scan_id: str, panoram_id: str, heading: float, elevation: float, time):
    """Show the panorama at the given camera pose in the 'Python RGB' window.

    Args:
        scan_id: Matterport scan (building) identifier.
        panoram_id: Panoramic viewpoint identifier within the scan.
        heading: Camera heading in radians.
        elevation: Camera elevation in radians.
        time: Display duration in seconds; a falsy value (None/0) blocks
            until a key is pressed. NOTE: this parameter shadows the `time`
            module inside the function body (name kept for compatibility).
    """
    sim.newEpisode([scan_id], [panoram_id], [heading], [elevation])
    state = sim.getState()[0]
    rgb = np.array(state.rgb, copy=False)
    cv2.imshow('Python RGB', rgb)
    if time:
        # BUG FIX: sub-millisecond durations used to truncate to
        # cv2.waitKey(0), which blocks forever; clamp to at least 1 ms.
        # Also replaced the C-style (int)(...) cast with int(...).
        cv2.waitKey(max(1, int(time * 1000)))
    else:
        cv2.waitKey()
 
import numpy as np

pi = 3.1415926535  # kept for parity with the original script (unused)

#file = sys.argv[1]
#print("File = ",file)
file="/root/mount/rxr-data/pose_traces/rxr_val_seen/000106_guide_pose_trace.npz"

data = np.load(file)

import pprint
#pprint.pprint(data['audio_time'])

num_snapshots = len(data['pano'])
for i in range(num_snapshots):
    print(data['pano'][i], data['time'][i], data['extrinsic_matrix'][i])
    # Camera position is the translation column of the extrinsic matrix
    # (assumes each stored matrix indexes as 4x4 — TODO confirm vs the
    # [n, 16] shape stated in the schema above).
    position = data['extrinsic_matrix'][i][:3, 3:]
    print("POSITION (x=", position[0][0], ",y=", position[1][0], ",z=", position[2][0], ")")
    rotation_matrix = data['extrinsic_matrix'][i][:3, :3]
    print("ROTATION:")
    print(rotation_matrix)

    # Forward axis = third column of the rotation matrix; convert it to
    # heading/elevation angles in radians.
    fx, fy, fz = rotation_matrix[0, 2], rotation_matrix[1, 2], rotation_matrix[2, 2]
    heading = np.arctan2(-fx, fz)
    elevation = -np.arcsin(fy)

    # NOTE(review): scan id is hard-coded; it must match the building the
    # trace file above was recorded in — confirm before replaying other files.
    scan_id = 'i5noydFURQK'
    panoram_id = data['pano'][i]
    # BUG FIX: the original always read time[i+1], which raised IndexError
    # on the final snapshot. Hold the last frame until a key press instead.
    if i + 1 < num_snapshots:
        delta_time = data['time'][i + 1] - data['time'][i]
    else:
        delta_time = None

    print("Heading =", heading)
    print("Elevation =", elevation)
    render(scan_id, panoram_id, heading, elevation, delta_time)
    print()
    print()

data.close()
 

References