From f49865666bb3e5d3a39424fed291ab0440151a20 Mon Sep 17 00:00:00 2001
From: copygirl
Date: Mon, 16 Dec 2024 23:12:12 +0100
Subject: [PATCH] Implement rest pose and fix hand rotations

---
 copyMediaPipe.gd   | 235 +++++++++++++++++++++++++++++----------------
 copyMediaPipe.tscn |   6 +-
 2 files changed, 155 insertions(+), 86 deletions(-)

diff --git a/copyMediaPipe.gd b/copyMediaPipe.gd
index b6963f3..e4723bd 100644
--- a/copyMediaPipe.gd
+++ b/copyMediaPipe.gd
@@ -1,24 +1,41 @@
 class_name copyMediaPipe
 extends Mod_Base
 
+var arm_rest_angle := 65
+var time_to_rest := 0.1 # Time without tracking data before returning to the rest pose.
+var interpolation_factor := 0.000000001 # Fraction of distance left after one second. Yes, it needs to be THAT small.
+var rest_interpolation_factor := 0.2 # "Lerp about 80% in one second."
+
+# TODO: Change this via calibration!
+var camera_transform := Transform3D(Basis(), Vector3(0.0, 0.0, 0.3))
+
 # FIXME: Best to get this from the tracker process (if possible).
 var camera_aspect_ratio := 4.0 / 3.0 # Logitech C920 default?
 
-@onready var tracker_head : Node3D = $TrackingRoot/Head
-@onready var tracker_hand_left : Node3D = $TrackingRoot/LeftHand
-@onready var tracker_hand_right : Node3D = $TrackingRoot/RightHand
-@onready var landmark_template : MeshInstance3D = $TrackingRoot/LandmarkTemplate
-@onready var landmarks_hand_left : Array[MeshInstance3D] = []
-@onready var landmarks_hand_right : Array[MeshInstance3D] = []
+@onready var tracking_root: Node3D = $TrackingRoot
+@onready var landmark_template: MeshInstance3D = $TrackingRoot/LandmarkTemplate
+
+@onready var head := {
+	last_data = null,                 # Most recent tracking data received.
+	last_received = INF,              # How long ago it was received (in seconds).
+	tracker = $TrackingRoot/Head,     # Node for visualizing tracking data.
+	rest_pose = Transform3D.IDENTITY, # Rest position of the head (from 0,0,0).
+}
 
 @onready var hands := {
 	left = {
-		tracker = tracker_hand_left,
-		landmarks = landmarks_hand_left,
+		last_data = null,
+		last_received = INF,
+		tracker = $TrackingRoot/LeftHand,
+		rest_pose = Transform3D.IDENTITY,
+		landmarks = [],
 	},
 	right = {
-		tracker = tracker_hand_right,
-		landmarks = landmarks_hand_right,
+		last_data = null,
+		last_received = INF,
+		tracker = $TrackingRoot/RightHand,
+		rest_pose = Transform3D.IDENTITY,
+		landmarks = [],
 	},
 }
 
@@ -41,17 +58,20 @@ func _exit_tree() -> void:
 	stop_process()
 
 # Called after mod is initialized or model is changed.
-func scene_init():
-	pass
+func scene_init() -> void:
+	initialize_rest_pose()
 
 # Called before mod is removed, model is changed or application is shut down.
-func scene_shutdown():
+func scene_shutdown() -> void:
 	pass
 
-func _process(_delta: float) -> void:
+func _process(delta: float) -> void:
+	increase_last_received(delta)
 	if is_tracker_running():
 		receive_tracker_packets()
+	update_visual_trackers(delta)
 
+## Sets up 21 nodes for the landmarks that make up hand/finger tracking.
 func setup_hand_landmarks() -> void:
 	for side in hands:
 		var hand = hands[side]
@@ -62,6 +82,33 @@ func setup_hand_landmarks() -> void:
 			hand.tracker.add_child(landmark)
 			hand.landmarks.append(landmark)
 
+## Initializes the stored rest positions for the head and hands.
+## Also applies a rotation to the arms so they're not T-posing.
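+##
+## Worked example (illustrative only; it assumes a rig where the arm points
+## along the shoulder bone's local +Y axis): a hand resting at (0.0, 0.5, 0.0)
+## in shoulder-local space, rotated by arm_rest_angle = 65 degrees around
+## Vector3.LEFT, ends up at (0.0, 0.5 * cos(65°), -0.5 * sin(65°)),
+## roughly (0.0, 0.211, -0.453), that is, the arm swings down out of the T-pose.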
+func initialize_rest_pose() -> void:
+	var skel := get_skeleton()
+	if not skel: return
+
+	var head_idx := skel.find_bone("Head")
+	var head_origin := skel.get_bone_global_rest(head_idx).origin
+
+	tracking_root.transform = camera_transform * Transform3D(Basis(), head_origin)
+	head.rest_pose = camera_transform.inverse()
+
+	for side in hands:
+		var shoulder_idx := skel.find_bone(side.capitalize() + "Shoulder")
+		var hand_idx := skel.find_bone(side.capitalize() + "Hand")
+		var shoulder_transform := skel.get_bone_global_rest(shoulder_idx)
+		var hand_transform := skel.get_bone_global_rest(hand_idx)
+
+		# First, get the relative transform of the hand to the shoulder.
+		var hand_to_shoulder := shoulder_transform.inverse() * hand_transform
+		# Next, rotate this relative transform by arm_rest_angle.
+		hand_to_shoulder = hand_to_shoulder.rotated(Vector3.LEFT, deg_to_rad(arm_rest_angle))
+		# Finally, put the relative transform back into skeleton-relative coordinates.
+		var hand_rest_transform := shoulder_transform * hand_to_shoulder
+
+		hands[side].rest_pose = tracking_root.transform.inverse() * hand_rest_transform
+
 # -----------------------------------------------------------------------------
 # Functions to start/stop the PYTHON TRACKER PROCESS and communicate with it.
 # -----------------------------------------------------------------------------
@@ -124,66 +171,58 @@ func receive_tracker_packets() -> void:
 		if bytes.size() == 0: break
 		var data = JSON.parse_string(bytes.get_string_from_utf8())
 		if data is Dictionary: process_tracker_data(data)
+	# FIXME: Find out why we appear to always be processing 2 packets a frame.
 
 # -----------------------------------------------------------------------------
-# Functions to PROCESS the incoming TRACKER DATA, and update tracker objects.
+# Functions to PROCESS and CONVERT the incoming TRACKER DATA.
 # -----------------------------------------------------------------------------
 
+func increase_last_received(delta: float) -> void:
+	head.last_received += delta
+	hands.left.last_received += delta
+	hands.right.last_received += delta
+
 func process_tracker_data(data: Dictionary) -> void:
 	if "error" in data: on_tracker_error(data.error); return
 	if "status" in data: on_tracker_status(data.status); return
 
-	convert_tracker_data(data)
-
-	# MediaPipe reports hands from a viewer's perspective, not the
-	# person's own actual left and right hand, so swap them out here.
-	var left = data["hands"]["left"]
-	var right = data["hands"]["right"]
-	data["hands"]["left"] = right
-	data["hands"]["right"] = left
+	# Convert the arrays inside data to known data types like Vector3 and Transform3D.
+	data["face"]["transform"] = to_transform(data["face"]["transform"])
+	for side in data["hands"]:
+		var hand = data["hands"][side]
+		# Convert untyped array of arrays to typed Array[Vector3].
+		var image_landmarks = hand["image_landmarks"].map(to_vector)
+		var world_landmarks = hand["world_landmarks"].map(to_vector)
+		hand["image_landmarks"] = Array(image_landmarks, TYPE_VECTOR3, "", null)
+		hand["world_landmarks"] = Array(world_landmarks, TYPE_VECTOR3, "", null)
+
+	# Face matrix is in centimeters, convert to meters.
+	data["face"]["transform"].origin /= 100
+
+	# TODO: Make this configurable.
+	var min_confidence_threshold := 0.85
+
+	# NOTE: Face confidence is currently either 0.0 or 1.0.
-	tracker_head.transform = data["face"]["transform"]
-	tracker_head.position /= 100 # Centimeters to meters.
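+	# A result is only stored below when it clears min_confidence_threshold;
+	# otherwise last_received keeps growing in _process() until
+	# update_visual_trackers() blends the tracker back to its rest pose.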
+	if data["face"]["confidence"] > min_confidence_threshold:
+		head.last_data = data["face"]
+		head.last_received = 0.0
 
-	# TODO: Actually use this.
-	var num_hands_detected := 0
 	for side in hands:
 		var hand = hands[side]
-		var tracker: Node3D = hand.tracker
-
-		# TODO: Don't automatically trust the handedness of the input data.
 		var hand_data = data["hands"][side]
-		var image_landmarks: Array[Vector3] = hand_data["image_landmarks"]
-		var world_landmarks: Array[Vector3] = hand_data["world_landmarks"]
-
-		# FIXME: Make this configurable.
-		var min_confidence_threshold := 0.85
-		if hand_data["confidence"] < min_confidence_threshold: continue
-		num_hands_detected += 1
-
-		# Mirror position on the X axis, since image landmarks are in view space.
-		for i in image_landmarks.size(): image_landmarks[i].x = (1 - image_landmarks[i].x)
-
-		tracker.basis = get_hand_rotation(world_landmarks)
-		tracker.position = get_hand_viewspace_origin(image_landmarks, world_landmarks, 2.0) \
-			* Vector3(7.0, 7.0, 3.5) # FIXME: Fudge factor to match better with world space.
-
-		# Translate landmarks so the origin is at the wrist.
-		var wrist_position := world_landmarks[0]
-		# World landmarks are in world space, so we have to "subtract" the hand rotation.
-		# Also, the rotation is all wrong, so apply that here as well.
-		var hand_rotation := tracker.basis.inverse() * Basis.from_euler(Vector3(TAU / 2, 0, 0))
-		for i in world_landmarks.size():
-			var pos := world_landmarks[i] - wrist_position
-			hand.landmarks[i].position = hand_rotation * pos
-
-	# TODO: Interpolation needs to be done outside of this function,
-	# as it could be called multiple times a frame, or not at all.
-
-	# Smoothly interpolate tracker transforms (in a framerate-independent way).
-	# var f := 0.0000000001 # Yes this value needs to be THAT small.
-	# tracker_head .transform = tracker_head .transform.interpolate_with(head_transform , 1 - f ** delta)
-	# tracker_hand_left .transform = tracker_hand_left .transform.interpolate_with(hand_left_transform , 1 - f ** delta)
-	# tracker_hand_right.transform = tracker_hand_right.transform.interpolate_with(hand_right_transform, 1 - f ** delta)
+		if hand_data["confidence"] > min_confidence_threshold:
+			var image_landmarks: Array[Vector3] = hand_data["image_landmarks"]
+			var world_landmarks: Array[Vector3] = hand_data["world_landmarks"]
+
+			# Mirror position on the X axis, since image landmarks are in view space.
+			for i in image_landmarks.size(): image_landmarks[i].x = (1 - image_landmarks[i].x)
+			# Unsure why, but world landmarks might be in a different coordinate system than expected?
+			var rotation_fix := Basis(Vector3.RIGHT, TAU / 2)
+			for i in world_landmarks.size(): world_landmarks[i] = rotation_fix * world_landmarks[i]
+
+			hand.last_data = hand_data
+			hand.last_received = 0.0
 
 func on_tracker_status(status: String) -> void:
 	set_status(status)
@@ -191,31 +230,60 @@ func on_tracker_status(status: String) -> void:
 func on_tracker_error(error: String) -> void:
 	print_log("Error: " + error)
 
-# -----------------------------------------------------------------------------
-# Functions that deal with CONVERTING the TRACKER DATA to Godot types.
-# -----------------------------------------------------------------------------
-
-## Converts the arrays inside data to known data types like Vector3 and Transform3D.
-func convert_tracker_data(data: Dictionary) -> void:
-	data["face"]["transform"] = to_transform(data["face"]["transform"])
-	for side in data["hands"]:
-		var hand = data["hands"][side]
-		# Convert untyped array of arrays to typed Array[Vector3].
-		var image_landmarks = hand["image_landmarks"].map(to_vector)
-		var world_landmarks = hand["world_landmarks"].map(to_vector)
-		hand["image_landmarks"] = Array(image_landmarks, TYPE_VECTOR3, "", null)
-		hand["world_landmarks"] = Array(world_landmarks, TYPE_VECTOR3, "", null)
-
 func to_vector(array) -> Vector3:
 	return Vector3(array[0], array[1], array[2])
 
 func to_transform(matrix) -> Transform3D:
 	return Transform3D(
 		Basis(Vector3(matrix[0][0], matrix[1][0], matrix[2][0]),
-			Vector3(matrix[0][1], matrix[1][1], matrix[2][1]),
-			Vector3(matrix[0][2], matrix[1][2], matrix[2][2])),
+		      Vector3(matrix[0][1], matrix[1][1], matrix[2][1]),
+		      Vector3(matrix[0][2], matrix[1][2], matrix[2][2])),
 		Vector3(matrix[0][3], matrix[1][3], matrix[2][3]))
 
+# -----------------------------------------------------------------------------
+# Functions that take the CONVERTED DATA and update the VISUAL TRACKER nodes.
+# -----------------------------------------------------------------------------
+
+func update_visual_trackers(delta: float) -> void:
+	if head.last_received >= time_to_rest:
+		# Reset to rest pose transform.
+		head.tracker.transform = fi_slerp(head.tracker.transform,
+			head.rest_pose, rest_interpolation_factor, delta)
+	else:
+		head.tracker.transform = fi_slerp(head.tracker.transform,
+			head.last_data["transform"], interpolation_factor, delta)
+
+	# TODO: Don't automatically trust the handedness of the input data.
+	for side in hands:
+		var hand = hands[side]
+		if hand.last_received >= time_to_rest:
+			# Reset to rest pose transform.
+			hand.tracker.transform = fi_slerp(hand.tracker.transform,
+				hand.rest_pose, rest_interpolation_factor, delta)
+		else:
+			var image_landmarks: Array[Vector3] = hand.last_data["image_landmarks"]
+			var world_landmarks: Array[Vector3] = hand.last_data["world_landmarks"]
+
+			var hand_rotation := get_hand_rotation(side, world_landmarks)
+			var hand_origin := get_hand_viewspace_origin(image_landmarks, world_landmarks, 2.0) \
+				* Vector3(7.0, 7.0, 3.5) # FIXME: Fudge factor to match better with world space.
+
+			var target_transform := Transform3D(hand_rotation, hand_origin)
+			hand.tracker.transform = fi_slerp(hand.tracker.transform,
+				target_transform, interpolation_factor, delta)
+
+			# Translate landmarks so the origin is at the wrist.
+			var wrist_position := world_landmarks[0]
+			# World landmarks are in world space, so we have to "subtract" the hand rotation.
+			for i in world_landmarks.size():
+				var pos := world_landmarks[i] - wrist_position
+				hand.landmarks[i].position = hand_rotation.inverse() * pos
+
+## Smoothly interpolates transforms in a framerate-independent way.
+## For example, a factor of 0.2 moves roughly 80% of the remaining distance in one second.
+func fi_slerp(value: Transform3D, target: Transform3D, factor: float, delta: float) -> Transform3D:
+	return value.interpolate_with(target, 1 - factor ** delta)
+
 # -----------------------------------------------------------------------------
 # -----------------------------------------------------------------------------
 
@@ -241,14 +309,17 @@
 const PINKY_PIP := 19
 const PINKY_DIP := 20
 const PINKY_TIP := 21
 
-# FIXME: I changed the way this was calculated and it doesn't quite fit the data right?
-func get_hand_rotation(landmarks: Array[Vector3]) -> Basis:
+## Calculates the hand rotation from the hand tracking's world landmarks.
+func get_hand_rotation(side: String, landmarks: Array[Vector3]) -> Basis:
 	var knuckles_center := (landmarks[INDEX_FINGER_MCP] + landmarks[RING_FINGER_TIP]) / 2
 	var wrist_to_knuckles := landmarks[WRIST].direction_to(knuckles_center)
 	var towards_thumb := landmarks[RING_FINGER_TIP].direction_to(landmarks[INDEX_FINGER_MCP])
-	var up := wrist_to_knuckles.cross(towards_thumb)
-	return Basis.looking_at(wrist_to_knuckles, up, true)
+	var palm_forward: Vector3
+	if side == "left": palm_forward = towards_thumb.cross(wrist_to_knuckles)
+	if side == "right": palm_forward = wrist_to_knuckles.cross(towards_thumb)
+
+	return Basis.looking_at(palm_forward, wrist_to_knuckles)
 
 ## Attempt to figure out the hand origin in viewspace.
 ## `hand_to_head_scale` is a fudge value so that we can attempt
diff --git a/copyMediaPipe.tscn b/copyMediaPipe.tscn
index 752dffd..92644ba 100644
--- a/copyMediaPipe.tscn
+++ b/copyMediaPipe.tscn
@@ -21,23 +21,21 @@
 rings = 3
 script = ExtResource("1_0kpr8")
 
 [node name="TrackingRoot" type="Node3D" parent="."]
-transform = Transform3D(1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1.5, 0.5)
 
 [node name="DebugVisuals" parent="TrackingRoot" instance=ExtResource("2_8wmot")]
 
 [node name="Head" type="MeshInstance3D" parent="TrackingRoot"]
-transform = Transform3D(1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, -0.3)
 mesh = SubResource("BoxMesh_wtdv4")
 
 [node name="DebugVisuals" parent="TrackingRoot/Head" instance=ExtResource("2_8wmot")]
 
 [node name="LeftHand" type="Node3D" parent="TrackingRoot"]
-transform = Transform3D(1, 0, 0, 0, 1, 0, 0, 0, 1, -0.5, 0, -0.3)
+transform = Transform3D(-4.37114e-08, 1, -4.37114e-08, 0, -4.37114e-08, -1, -1, -4.37114e-08, 1.91069e-15, 0.5, 0, 0)
 
 [node name="DebugVisuals" parent="TrackingRoot/LeftHand" instance=ExtResource("2_8wmot")]
 
 [node name="RightHand" type="Node3D" parent="TrackingRoot"]
-transform = Transform3D(1, 0, 0, 0, 1, 0, 0, 0, 1, 0.5, 0, -0.3)
+transform = Transform3D(1.91069e-15, -1, 4.37114e-08, -4.37114e-08, -4.37114e-08, -1, 1, 0, -4.37114e-08, -0.5, 0, 0)
 
 [node name="DebugVisuals" parent="TrackingRoot/RightHand" instance=ExtResource("2_8wmot")]
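
Illustrative note (not part of the patch): fi_slerp() is framerate-independent
because blending with weight 1 - factor ** delta leaves a fraction
factor ** delta of the remaining distance each step, so two half-steps compose
to exactly one full step: factor ** (dt / 2) * factor ** (dt / 2) == factor ** dt.
A quick GDScript sanity check of that identity (function name is hypothetical):

	func _check_fi_slerp_consistency() -> void:
		var factor := 0.2
		var after_full_step := pow(factor, 1.0) # remaining fraction after one 1 s step
		var after_half_steps := pow(factor, 0.5) * pow(factor, 0.5) # two 0.5 s steps
		print(after_full_step, " ", after_half_steps) # both print ~0.2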