Pose Traces
Guide and follower annotations are paired with their corresponding pose traces: a sequence of snapshots that capture the annotator’s virtual camera pose and field-of-view. The naming convention for these files is {instruction_id:06}_guide_pose_trace.npz
for guide annotations and {demonstration_id:06}_follower_pose_trace.npz
for follower annotations.
Data schema:
{'pano': (np.str, [n, 1]),
'time': (np.float32, [n, 1]),
'audio_time': (np.float32, [n, 1]),
'extrinsic_matrix': (np.float32, [n, 16]),
'intrinsic_matrix': (np.float32, [n, 16]),
'image_mask': (np.bool, [k, 128, 256]),
'text_masks': (np.bool, [k, m]),
'feature_weights': (np.float32, [k, 36])}
Where n
is the number of snapshots, k
is the number of panoramic viewpoints in the associated path, and m
is the number of BERT subword tokens in the tokenized instructions.
Field descriptions:
pano: Panoramic viewpoint of the snapshot.
time: Timestamp of the snapshot in seconds.
audio_time: Timestamp corresponding to the follower’s progress listening to the guide’s instruction recording. This is only included in follower pose traces.
extrinsic_matrix: The extrinsic parameters, or pose, of the annotator’s camera.
intrinsic_matrix: The intrinsic parameters, or projection matrix, of the annotator’s camera.
image_mask: Mask indicating the pixels observed in the panorama. This mask is in equirectangular format, with heading angle 0 being the center of the image.
text_masks: Mask indicating the utterances that have been spoken or heard by the guide or follower, respectively, at this panoramic viewpoint.
feature_weights: An image_mask in feature space, corresponding to the typical setting in which 36 image features are generated at 12 heading and 3 elevation increments. Each value is a mean-pooled perspective projection of the image_mask for a particular heading and elevation.
Note that image_mask
, text_masks
, and feature_weights
are provided solely for convenience, as they can be generated from the other pose trace fields and the timed_instruction
.
Python program to print values
import numpy as np

file = "./rxr-data/pose_traces/rxr_val_seen/104668_guide_pose_trace.npz"

# NpzFile supports the context-manager protocol, so the archive is closed
# even if printing raises (the original relied on a manual close()).
with np.load(file) as data:
    # Walk the parallel arrays in lockstep instead of indexing by position.
    for pano, t, extrinsic in zip(data['pano'], data['time'],
                                  data['extrinsic_matrix']):
        print(pano, t, extrinsic)
Interpreting the data
Dissecting the Camera Matrix, Part 2: The Extrinsic Matrix; Dissecting the Camera Matrix, Part 3: The Intrinsic Matrix; Wikipedia: Rotation Matrix; Stackoverflow: Rotation Matrix to Euler Angles (and accompanying code)
Matterport3D API
Requirements:
- Width
- Height
- Vertical Field Of View
- Horizontal Field Of View
- ScanID
- Panorama ID
- Heading
- Elevation
import MatterSim
import time
import math
import cv2
import numpy as np
# Interactive MatterSim demo: WASD / arrow keys rotate the camera,
# number keys jump to the labeled nearby viewpoints, 'q' quits.
WIDTH = 1366
HEIGHT = 768
VFOV = math.radians(60)
HFOV = VFOV * WIDTH / HEIGHT  # horizontal FOV derived from the aspect ratio
TEXT_COLOR = [230, 40, 40]

cv2.namedWindow('Python RGB')
cv2.namedWindow('Python Depth')

sim = MatterSim.Simulator()
sim.setCameraResolution(WIDTH, HEIGHT)
sim.setCameraVFOV(VFOV)
sim.setDepthEnabled(True) # Turn on depth only after running ./scripts/depth_to_skybox.py (see README.md)
sim.initialize()
#sim.newEpisode(['2t7WUuJeko7'], ['1e6b606b44df4a6086c0f97e826d4d15'], [0], [0])
#sim.newEpisode(['1LXtFkjw3qL'], ['0b22fa63d0f54a529c525afbf2e8bb25'], [0], [0])
#sim.newRandomEpisode(['i5noydFURQK'])
sim.newEpisode(['i5noydFURQK'], ['41f833ed92c0489bb85a911668189278'], [3.103316889455171], [0])

heading = 0
elevation = 0
location = 0
ANGLEDELTA = 5 * math.pi / 180  # 5 degrees per key press, in radians

print('\nPython Demo')
print('Use arrow keys to move the camera.')
print('Use number keys (not numpad) to move to nearby viewpoints indicated in the RGB view.')
print('Depth outputs are turned off by default - check driver.py:L20 to enable.\n')

while True:
    # BUG FIX: the original dumped inspect.getmembers(sim) and print(sim)
    # on every frame — leftover debugging that flooded stdout; removed.
    sim.makeAction([location], [heading], [elevation])
    # Reset the pending action so a frame with no key press is a no-op.
    location = 0
    heading = 0
    elevation = 0
    state = sim.getState()[0]
    locations = state.navigableLocations
    rgb = np.array(state.rgb, copy=False)
    # Label reachable viewpoints so the number keys advertised in the help
    # text above actually have on-screen targets (this loop was commented
    # out, contradicting the printed instructions).
    for idx, loc in enumerate(locations[1:]):
        fontScale = 3.0 / loc.rel_distance
        x = int(WIDTH / 2 + loc.rel_heading / HFOV * WIDTH)
        y = int(HEIGHT / 2 - loc.rel_elevation / VFOV * HEIGHT)
        cv2.putText(rgb, str(idx + 1), (x, y), cv2.FONT_HERSHEY_SIMPLEX,
                    fontScale, TEXT_COLOR, thickness=3)
    cv2.imshow('Python RGB', rgb)
    depth = np.array(state.depth, copy=False)
    cv2.imshow('Python Depth', depth)
    k = cv2.waitKey(1)
    if k == -1:
        continue  # no key pressed this frame
    k &= 255  # strip modifier bits reported by some platforms
    if k == ord('q'):
        break
    elif ord('1') <= k <= ord('9'):
        location = k - ord('0')
        if location >= len(locations):
            location = 0  # ignore digits with no matching viewpoint
    elif k == 81 or k == ord('a'):   # left arrow
        heading = -ANGLEDELTA
    elif k == 82 or k == ord('w'):   # up arrow
        elevation = ANGLEDELTA
    elif k == 83 or k == ord('d'):   # right arrow
        heading = ANGLEDELTA
    elif k == 84 or k == ord('s'):   # down arrow
        elevation = -ANGLEDELTA
FINAL PROGRAM TO REPLAY DATA
import MatterSim
import time
import math
import cv2
import numpy as np
# Window/camera configuration for replaying recorded pose traces.
WIDTH = 1366
HEIGHT = 768
# Vertical field of view in radians (60 degrees).
FOV = math.radians(60)
cv2.namedWindow('Python RGB')
# Simulator is configured before initialize(), matching the interactive
# demo above — presumably resolution/VFOV must be set first (TODO confirm
# against the MatterSim API).
sim = MatterSim.Simulator()
sim.setCameraResolution(WIDTH, HEIGHT)
sim.setCameraVFOV(FOV)
sim.initialize()
def render(scan_id: str, panoram_id: str, heading: float, elevation: float,
           time: float = None):
    """Display one pose-trace snapshot in the 'Python RGB' window.

    Starts a fresh episode at the given panorama with the given camera
    heading/elevation, shows the rendered frame, then blocks for `time`
    seconds — or until a key press when `time` is falsy (None or 0).

    NOTE(review): the parameter name `time` shadows the `time` module
    imported at the top of the file; kept for backward compatibility.
    """
    sim.newEpisode([scan_id], [panoram_id], [heading], [elevation])
    state = sim.getState()[0]
    rgb = np.array(state.rgb, copy=False)
    cv2.imshow('Python RGB', rgb)
    if time:
        # cv2.waitKey expects milliseconds; the original C-style cast
        # "(int)(...)" is replaced with the Python builtin call.
        cv2.waitKey(int(time * 1000))
    else:
        cv2.waitKey()
# numpy is already imported at the top of the file; the duplicate import
# and the unused `pi` constant were removed.
#file = sys.argv[1]  # TODO: accept the trace path on the command line
file = "/root/mount/rxr-data/pose_traces/rxr_val_seen/000106_guide_pose_trace.npz"
# NOTE(review): hard-coded scan; must correspond to the scan the loaded
# pose trace was recorded in — verify when changing `file`.
scan_id = 'i5noydFURQK'

# Context manager closes the npz archive even if rendering raises.
with np.load(file) as data:
    n = len(data['pano'])
    for i in range(n):
        print(data['pano'][i], data['time'][i], data['extrinsic_matrix'][i])
        # Assumes the stored extrinsic is a 4x4 camera matrix: translation
        # in the last column, rotation in the upper-left 3x3 block.
        extrinsic = data['extrinsic_matrix'][i]
        position = extrinsic[:3, 3:]
        print("POSITION (x=", position[0][0], ",y=", position[1][0], ",z=", position[2][0], ")")
        rotation_matrix = extrinsic[:3, :3]
        print("ROTATION:")
        print(rotation_matrix)
        # Camera forward axis (third column of R) -> heading/elevation.
        fx, fy, fz = rotation_matrix[0, 2], rotation_matrix[1, 2], rotation_matrix[2, 2]
        heading = np.arctan2(-fx, fz)
        elevation = -np.arcsin(fy)
        panoram_id = data['pano'][i]
        # BUG FIX: the original read data['time'][i+1] unconditionally and
        # raised IndexError on the last snapshot. For the final frame fall
        # back to None, so render() waits for a key press instead.
        delta_time = data['time'][i + 1] - data['time'][i] if i + 1 < n else None
        print("Heading =", heading)
        print("Elevation =", elevation)
        render(scan_id, panoram_id, heading, elevation, delta_time)
        print()
        print()