Pose Traces

Guide and follower annotations are paired with their corresponding pose traces: a sequence of snapshots that capture the annotator’s virtual camera pose and field-of-view. The naming convention for these files is {instruction_id:06}_guide_pose_trace.npz for guide annotations and {demonstration_id:06}_follower_pose_trace.npz for follower annotations.

Data schema:

{'pano': (np.str, [n, 1]),
 'time': (np.float32, [n, 1]),
 'audio_time': (np.float32, [n, 1]),
 'extrinsic_matrix': (np.float32, [n, 16]),
 'intrinsic_matrix': (np.float32, [n, 16]),
 'image_mask': (np.bool, [k, 128, 256]),
 'text_masks': (np.bool, [k, m]),
 'feature_weights': (np.float32, [k, 36])}

Where n is the number of snapshots, k is the number of panoramic viewpoints in the associated path, and m is the number of BERT SubWord tokens in the tokenized instruction.

Field descriptions:

  • pano: Panoramic viewpoint of the snapshot.
  • time: Timestamp of the snapshot in seconds.
  • audio_time: Timestamp corresponding to the follower’s progress listening to the guide’s instruction recording. This is only included in follower pose traces.
  • extrinsic_matrix: The extrinsic parameters, or pose, of the annotator’s camera.
  • intrinsic_matrix: The intrinsic parameters, or projection matrix, of the annotator’s camera.
  • image_mask: Mask indicating the pixels observed in the panorama. This mask is in equirectangular format, with heading angle 0 being the center of the image.
  • text_masks: Mask indicating the utterances that have been spoken or heard by the guide or follower, respectively, at this panoramic viewpoint.
  • feature_weights: An image_mask in feature space, corresponding to the typical setting in which 36 image features are generated at 12 heading and 3 elevation increments. Each value is a mean-pooled perspective projection of the image_mask for a particular heading and elevation.

Note that image_mask, text_masks, and feature_weights are provided solely for convenience, as they can be generated from the other pose trace fields and the timed_instruction.

Python program to print pose-trace values:
import numpy as np

# Path to one guide pose trace (see the naming convention above:
# {instruction_id:06}_guide_pose_trace.npz).
npz_path = "./rxr-data/pose_traces/rxr_val_seen/104668_guide_pose_trace.npz"

# NpzFile supports the context-manager protocol, so the underlying file
# handle is closed even if printing raises (the original called close()
# after the loop, which is skipped on an exception).
with np.load(npz_path) as data:
    # Iterate the parallel per-snapshot arrays in lockstep instead of
    # indexing by position.
    for pano, t, extrinsic in zip(data['pano'], data['time'],
                                  data['extrinsic_matrix']):
        print(pano, t, extrinsic)
 

Interpreting the data

  • Dissecting the Camera Matrix, Part 2: The Extrinsic Matrix
  • Dissecting the Camera Matrix, Part 3: The Intrinsic Matrix
  • Wikipedia: Rotation Matrix
  • Stackoverflow: Rotation Matrix to Euler Angles
  • Stackoverflow: Rotation Matrix to Euler Angles (code)

Matterport3D API

Requirements:
  • Width
  • Height
  • Vertical Field Of View
  • Horizontal Field Of View
  • ScanID
  • Panorama ID
  • Heading
  • Elevation
 
import MatterSim
import time
import math
import cv2
import numpy as np

# Render resolution; HFOV is derived from VFOV via the aspect ratio so
# horizontal and vertical angular scales match.
WIDTH = 1366
HEIGHT = 768
VFOV = math.radians(60)
HFOV = VFOV*WIDTH/HEIGHT
TEXT_COLOR = [230, 40, 40]  # BGR colour for the (commented-out) nav labels

cv2.namedWindow('Python RGB')
cv2.namedWindow('Python Depth')

sim = MatterSim.Simulator()
sim.setCameraResolution(WIDTH, HEIGHT)
sim.setCameraVFOV(VFOV)
sim.setDepthEnabled(True) # Turn on depth only after running ./scripts/depth_to_skybox.py (see README.md)
sim.initialize()
#sim.newEpisode(['2t7WUuJeko7'], ['1e6b606b44df4a6086c0f97e826d4d15'], [0], [0])
#sim.newEpisode(['1LXtFkjw3qL'], ['0b22fa63d0f54a529c525afbf2e8bb25'], [0], [0])
#sim.newRandomEpisode(['i5noydFURQK'])
sim.newEpisode(['i5noydFURQK'], ['41f833ed92c0489bb85a911668189278'], [3.103316889455171], [0])

# Pending action for the next makeAction() call; updated by the key
# handling at the bottom of the loop.
heading = 0
elevation = 0
location = 0
ANGLEDELTA = 5 * math.pi / 180  # camera rotate/tilt step: 5 degrees, in radians

print('\nPython Demo')
print('Use arrow keys to move the camera.')
print('Use number keys (not numpad) to move to nearby viewpoints indicated in the RGB view.')
print('Depth outputs are turned off by default - check driver.py:L20 to enable.\n')

while True:
    # BUG FIX: removed leftover debug statements that dumped every simulator
    # member via inspect.getmembers() and printed the object on every frame,
    # throttling the interactive loop and flooding stdout.
    sim.makeAction([location], [heading], [elevation])
    # Reset the pending action so frames with no keypress are no-ops.
    location = 0
    heading = 0
    elevation = 0

    state = sim.getState()[0]
    locations = state.navigableLocations
    rgb = np.array(state.rgb, copy=False)
    #for idx, loc in enumerate(locations[1:]):
    #    # Draw actions on the screen
    #    fontScale = 3.0/loc.rel_distance
    #    x = int(WIDTH/2 + loc.rel_heading/HFOV*WIDTH)
    #    y = int(HEIGHT/2 - loc.rel_elevation/VFOV*HEIGHT)
    #    cv2.putText(rgb, str(idx + 1), (x, y), cv2.FONT_HERSHEY_SIMPLEX, 
    #        fontScale, TEXT_COLOR, thickness=3)
    cv2.imshow('Python RGB', rgb)

    depth = np.array(state.depth, copy=False)
    cv2.imshow('Python Depth', depth)

    k = cv2.waitKey(1)
    if k == -1:  # no key pressed within the 1 ms window
        continue
    k = (k & 255)  # keep only the low byte of the key code
    if k == ord('q'):
        break
    elif ord('1') <= k <= ord('9'):
        # Jump to the numbered navigable viewpoint; out-of-range -> stay put.
        location = k - ord('0')
        if location >= len(locations):
            location = 0
    # 81-84 are arrow-key codes as emitted by OpenCV HighGUI;
    # NOTE(review): these codes are platform-specific — confirm on other OSes.
    elif k == 81 or k == ord('a'):
        heading = -ANGLEDELTA
    elif k == 82 or k == ord('w'):
        elevation = ANGLEDELTA
    elif k == 83 or k == ord('d'):
        heading = ANGLEDELTA
    elif k == 84 or k == ord('s'):
        elevation = -ANGLEDELTA
 

FINAL PROGRAM TO REPLAY DATA

 
import MatterSim
import time
import math
import cv2
import numpy as np
 
# Replay-window render settings.
WIDTH = 1366
HEIGHT = 768
# Vertical field of view in radians (60 degrees).
FOV = math.radians(60)
 
cv2.namedWindow('Python RGB')
 
 
# Single global simulator instance, reused by render() for every snapshot.
sim = MatterSim.Simulator()
sim.setCameraResolution(WIDTH, HEIGHT)
sim.setCameraVFOV(FOV)
sim.initialize()
 
 
def render(scan_id: str, panoram_id: str, heading: float, elevation: float, time):
    """Show the panorama at the given camera pose in the 'Python RGB' window.

    Args:
        scan_id: Matterport scan (building) identifier.
        panoram_id: Panoramic viewpoint identifier within the scan.
        heading: Camera heading in radians.
        elevation: Camera elevation in radians.
        time: Display duration in seconds; a falsy value (None/0) blocks
            until a key is pressed. NOTE: this parameter shadows the `time`
            module inside the function body (name kept for compatibility).
    """
    sim.newEpisode([scan_id], [panoram_id], [heading], [elevation])
    state = sim.getState()[0]
    rgb = np.array(state.rgb, copy=False)
    cv2.imshow('Python RGB', rgb)
    if time:
        # BUG FIX: sub-millisecond durations used to truncate to
        # cv2.waitKey(0), which blocks forever; clamp to at least 1 ms.
        # Also replaced the C-style (int)(...) cast with int(...).
        cv2.waitKey(max(1, int(time * 1000)))
    else:
        cv2.waitKey()
 
import numpy as np

pi = 3.1415926535  # kept for parity with the original script (unused)

#file = sys.argv[1]
#print("File = ",file)
file="/root/mount/rxr-data/pose_traces/rxr_val_seen/000106_guide_pose_trace.npz"

data = np.load(file)

import pprint
#pprint.pprint(data['audio_time'])

num_snapshots = len(data['pano'])
for i in range(num_snapshots):
    print(data['pano'][i], data['time'][i], data['extrinsic_matrix'][i])
    # Camera position is the translation column of the extrinsic matrix
    # (assumes each stored matrix indexes as 4x4 — TODO confirm vs the
    # [n, 16] shape stated in the schema above).
    position = data['extrinsic_matrix'][i][:3, 3:]
    print("POSITION (x=", position[0][0], ",y=", position[1][0], ",z=", position[2][0], ")")
    rotation_matrix = data['extrinsic_matrix'][i][:3, :3]
    print("ROTATION:")
    print(rotation_matrix)

    # Forward axis = third column of the rotation matrix; convert it to
    # heading/elevation angles in radians.
    fx, fy, fz = rotation_matrix[0, 2], rotation_matrix[1, 2], rotation_matrix[2, 2]
    heading = np.arctan2(-fx, fz)
    elevation = -np.arcsin(fy)

    # NOTE(review): scan id is hard-coded; it must match the building the
    # trace file above was recorded in — confirm before replaying other files.
    scan_id = 'i5noydFURQK'
    panoram_id = data['pano'][i]
    # BUG FIX: the original always read time[i+1], which raised IndexError
    # on the final snapshot. Hold the last frame until a key press instead.
    if i + 1 < num_snapshots:
        delta_time = data['time'][i + 1] - data['time'][i]
    else:
        delta_time = None

    print("Heading =", heading)
    print("Elevation =", elevation)
    render(scan_id, panoram_id, heading, elevation, delta_time)
    print()
    print()

data.close()
 

References