class_name copyMediaPipe extends Mod_Base # ----------------------------------------------------------------------------- # Potentially configurable variables. # ----------------------------------------------------------------------------- enum BlendshapeMode { NONE, MEDIA_PIPE, VRM_STANDARD } var blendshape_mode := BlendshapeMode.VRM_STANDARD var arm_rest_angle := 65 var interpolation_factor := 0.000000001 # Yes this value needs to be THAT small. var rest_interpolation_factor := 0.2 # "Lerp about 80% of the way in one second." var min_confidence_threshold := 0.85 var time_to_rest := 0.1 # Time without tracking data before returning to the rest pose. # TODO: Change this via calibration! var camera_transform := Transform3D(Basis(), Vector3(0.0, 0.0, 0.3)) # ----------------------------------------------------------------------------- # ----------------------------------------------------------------------------- # FIXME: Best to get this from the tracker process (if possible). var camera_aspect_ratio := 4.0 / 3.0 # Logitech C920 default? # TODO: Ensure that this works with the model offset from the world origin. var ik_chains: Array[copyMediaPipe_IKChain] = [] @onready var tracking_root: Node3D = $TrackingRoot @onready var head := { last_data = null, # Most recent tracking data received. last_received = INF, # How long ago it was received (in seconds). tracker = $TrackingRoot/Head, # Node for visualizing tracking data. rest_pose = Transform3D.IDENTITY, # Rest position of the head. } @onready var hands := { left = { last_data = null, last_received = INF, tracker = $TrackingRoot/LeftHand, rest_pose = Transform3D.IDENTITY, landmarks = [], }, right = { last_data = null, last_received = INF, tracker = $TrackingRoot/RightHand, rest_pose = Transform3D.IDENTITY, landmarks = [], }, } # ----------------------------------------------------------------------------- # ----------------------------------------------------------------------------- func _ready() -> void: setup_hand_landmarks() var dir = get_script().get_path().get_base_dir() var path = dir.path_join("_tracker/Project/new_new_tracker.py") python_process = KiriPythonWrapperInstance.new(path) if not python_process.setup_python(false): OS.alert("Failed to setup tracker dependencies!") start_process() # FIXME: Don't hardcode the video device. set_video_device(get_video_devices()[0]) start_tracker() func _exit_tree() -> void: stop_tracker() stop_process() # Called after mod is initialized or model is changed. func scene_init() -> void: initialize_rest_pose() initialize_ik_chains() # Called before mod is removed, model is changed or application is shut down. func scene_shutdown() -> void: ik_chains = [] func _process(delta: float) -> void: increase_last_received(delta) if is_tracker_running(): receive_tracker_packets() update_visual_trackers(delta) update_ik_chains() update_blendshapes() ## Sets up 21 nodes for the landmarks that make up hand/finger tracking. func setup_hand_landmarks() -> void: const landmark_scene := preload("Resources/debug_landmark.tscn") for side in hands: var hand = hands[side] for i in 21: var landmark := landmark_scene.instantiate() hand.tracker.add_child(landmark) hand.landmarks.append(landmark) # ----------------------------------------------------------------------------- # Initialization functions that are called when a new model is loaded. # ----------------------------------------------------------------------------- ## Initialized the stored rest positions for the head and hands. ## Also applies a rotation to the arms so they're not T-posing. func initialize_rest_pose() -> void: var skel := get_skeleton() if not skel: return var head_idx := skel.find_bone("Head") var head_rest := skel.get_bone_global_rest(head_idx) # Move the tracking root such that it is at the height of the head. tracking_root.transform = camera_transform * head_rest head.rest_pose = tracking_root.transform.inverse() * head_rest for side in hands: var shoulder_idx := skel.find_bone(side.capitalize() + "Shoulder") var hand_idx := skel.find_bone(side.capitalize() + "Hand") var shoulder_rest := skel.get_bone_global_rest(shoulder_idx) var hand_rest := skel.get_bone_global_rest(hand_idx) # First, get relative transform of hand to shoulder. var hand_to_shoulder := shoulder_rest.inverse() * hand_rest # Next, rotate this relative transform by arm_rest_angle. hand_to_shoulder = hand_to_shoulder.rotated(Vector3.LEFT, deg_to_rad(arm_rest_angle)) # Finally, put the relative transform back into skeleton-relative coordinates. var new_hand_transform := shoulder_rest * hand_to_shoulder hands[side].rest_pose = tracking_root.transform.inverse() * new_hand_transform ## Sets up the inverse kinematics chains to move the model depending on the location of the visual trackers. func initialize_ik_chains() -> void: ik_chains = [] var chain_spine := copyMediaPipe_IKChain.new() chain_spine.skeleton = get_skeleton() chain_spine.base_bone = "Hips" chain_spine.tip_bone = "Head" chain_spine.rotation_low = 0.0 * TAU chain_spine.rotation_high = 1.0 * TAU chain_spine.do_yaw = true chain_spine.main_axis_of_rotation = Vector3.RIGHT chain_spine.secondary_axis_of_rotation = Vector3.UP chain_spine.pole_direction_target = Vector3.ZERO # No pole target. chain_spine.tracker_object = head.tracker chain_spine.yaw_scale = 0.25 # chest_yaw_scale (Unsure what this does.) ik_chains.append(chain_spine) var x_pole_dist = 10.0 var y_pole_dist = 5.0 var z_pole_dist = 10.0 var arm_rotation_axis = Vector3.UP for side in hands: var hand = hands[side] var chain_hand := copyMediaPipe_IKChain.new() chain_hand.skeleton = get_skeleton() chain_hand.base_bone = side.capitalize() + "UpperArm" chain_hand.tip_bone = side.capitalize() + "Hand" chain_hand.rotation_low = 0.025 * TAU chain_hand.rotation_high = 0.990 * TAU chain_hand.do_yaw = false chain_hand.do_bone_roll = true chain_hand.secondary_axis_of_rotation = Vector3.UP if side == "left": chain_hand.main_axis_of_rotation = -arm_rotation_axis chain_hand.pole_direction_target = Vector3(x_pole_dist, -y_pole_dist, -z_pole_dist) chain_hand.tracker_object = hand.tracker else: chain_hand.main_axis_of_rotation = arm_rotation_axis chain_hand.pole_direction_target = Vector3(-x_pole_dist, -y_pole_dist, -z_pole_dist) chain_hand.tracker_object = hand.tracker ik_chains.append(chain_hand) # ----------------------------------------------------------------------------- # Functions to start/stop the PYTHON TRACKER PROCESS and communicate with it. # ----------------------------------------------------------------------------- var python_process: KiriPythonWrapperInstance func start_process() -> void: python_process.start_process(false) func stop_process() -> void: python_process.stop_process() func is_process_running() -> bool: return python_process.get_status() == KiriPythonWrapperInstance.KiriPythonWrapperStatus.STATUS_RUNNING # [{ name: String, backend: String, path: String, index: int }] func get_video_devices() -> Array: assert(is_process_running()) var devices = python_process.call_rpc_sync("enumerate_camera_devices", []) return devices if devices is Array else [] func set_video_device(device) -> void: assert(is_process_running()) var index: int = device.index if device else -1 python_process.call_rpc_sync("set_video_device_number", [ index ]) # ----------------------------------------------------------------------------- # Functions to start/stop the TRACKER and receive packets coming from it. # ----------------------------------------------------------------------------- var base_port := 7098 var udp_server: PacketPeerUDP var udp_server_port: int func start_tracker() -> void: assert(!is_tracker_running()) udp_server = PacketPeerUDP.new() # Find a port number that's open to use. udp_server_port = base_port while udp_server.bind(udp_server_port, "127.0.0.1") != OK: udp_server_port += 1 python_process.call_rpc_sync("set_udp_port_number", [ udp_server_port ]) python_process.call_rpc_sync("start_tracker", []) func stop_tracker() -> void: if !is_tracker_running(): return # Do nothing if tracker isn't running. python_process.call_rpc_sync("stop_tracker", []) udp_server.close() udp_server = null func is_tracker_running() -> bool: return udp_server != null func receive_tracker_packets() -> void: assert(is_tracker_running()) while true: var bytes := udp_server.get_packet() if bytes.size() == 0: break var data = JSON.parse_string(bytes.get_string_from_utf8()) if data is Dictionary: process_tracker_data(data) # FIXME: Find out why we appear to always be processing 2 packets a frame. # ----------------------------------------------------------------------------- # Functions to PROCESS and CONVERT the incoming TRACKER DATA. # ----------------------------------------------------------------------------- func increase_last_received(delta: float) -> void: head.last_received += delta hands.left.last_received += delta hands.right.last_received += delta func process_tracker_data(data: Dictionary) -> void: if "error" in data: on_tracker_error(data.error); return if "status" in data: on_tracker_status(data.status); return # Convert the arrays inside data to known data types like Vector3 and Transform3D. data["face"]["transform"] = to_transform(data["face"]["transform"]) for side in data["hands"]: var hand = data["hands"][side] # Convert untyped array of arrays to typed Array[Vector3]. var image_landmarks = hand["image_landmarks"].map(to_vector) var world_landmarks = hand["world_landmarks"].map(to_vector) hand["image_landmarks"] = Array(image_landmarks, TYPE_VECTOR3, "", null) hand["world_landmarks"] = Array(world_landmarks, TYPE_VECTOR3, "", null) # Face matrix is in centimeters, convert to meters. data["face"]["transform"].origin /= 100 # NOTE: Face confidence currently either 0.0 or 1.0. if data["face"]["confidence"] > min_confidence_threshold: head.last_data = data["face"] head.last_received = 0.0 for side in hands: var hand = hands[side] var hand_data = data["hands"][side] if hand_data["confidence"] > min_confidence_threshold: var image_landmarks: Array[Vector3] = hand_data["image_landmarks"] var world_landmarks: Array[Vector3] = hand_data["world_landmarks"] # Mirror position on the X axis, since image landmarks are in view space. for i in image_landmarks.size(): image_landmarks[i].x = (1 - image_landmarks[i].x) # Unsure why, but world landmarks might be in a different coordinate system than expected? var rotation_fix := Basis(Vector3.RIGHT, TAU / 2) for i in world_landmarks.size(): world_landmarks[i] = rotation_fix * world_landmarks[i] hand.last_data = hand_data hand.last_received = 0.0 func on_tracker_status(status: String) -> void: set_status(status) func on_tracker_error(error: String) -> void: print_log("Error: " + error) func to_vector(array) -> Vector3: return Vector3(array[0], array[1], array[2]) func to_transform(matrix) -> Transform3D: return Transform3D( Basis(Vector3(matrix[0][0], matrix[1][0], matrix[2][0]), Vector3(matrix[0][1], matrix[1][1], matrix[2][1]), Vector3(matrix[0][2], matrix[1][2], matrix[2][2])), Vector3(matrix[0][3], matrix[1][3], matrix[2][3])) # ----------------------------------------------------------------------------- # Functions for updating VISUAL TRACKERS and THE MODEL itself. # ----------------------------------------------------------------------------- func update_visual_trackers(delta: float) -> void: if head.last_received >= time_to_rest: # Reset to rest pose transform. head.tracker.transform = fi_slerp(head.tracker.transform, head.rest_pose, rest_interpolation_factor, delta) else: head.tracker.transform = fi_slerp(head.tracker.transform, head.last_data["transform"], interpolation_factor, delta) # TODO: Don't automatically trust the handedness of the input data. for side in hands: var hand = hands[side] if hand.last_received >= time_to_rest: # Reset to rest pose transform. hand.tracker.transform = fi_slerp(hand.tracker.transform, hand.rest_pose, rest_interpolation_factor, delta) else: var image_landmarks: Array[Vector3] = hand.last_data["image_landmarks"] var world_landmarks: Array[Vector3] = hand.last_data["world_landmarks"] var hand_rotation := get_hand_rotation(side, world_landmarks) var hand_origin := get_hand_viewspace_origin(image_landmarks, world_landmarks, 2.0) \ * Vector3(7.0, 7.0, 3.5) # FIXME: Fudge factor to match better with world space. var target_transform := Transform3D(hand_rotation, hand_origin) hand.tracker.transform = fi_slerp(hand.tracker.transform, target_transform, interpolation_factor, delta) # Translate landmarks so the origin is at the wrist. var wrist_position := world_landmarks[0] # World landmarks are in world space, so we have to "subtract" the hand rotation. for i in world_landmarks.size(): var pos := world_landmarks[i] - wrist_position hand.landmarks[i].position = hand_rotation.inverse() * pos func update_ik_chains() -> void: for chain in ik_chains: chain.do_ik_chain() func update_blendshapes() -> void: const Blendshapes := preload("res://Mods/MediaPipe/MediaPipeController_BlendShapes.gd") var model := get_model() if (not model) or (not head.last_data): return var data: Dictionary = head.last_data.blendshapes var shape_dict: Dictionary match blendshape_mode: BlendshapeMode.MEDIA_PIPE: shape_dict = data BlendshapeMode.VRM_STANDARD: shape_dict = \ Blendshapes.convert_mediapipe_shapes_to_vrm_standard(data) # TODO: Blendshapes.apply_smoothing(...) Blendshapes.fixup_eyes(shape_dict) Blendshapes.apply_animations(model, shape_dict) # ----------------------------------------------------------------------------- # Utility functions, currently only relating to update_visual_trackers. # ----------------------------------------------------------------------------- # Indices of hand landmarks. const WRIST := 0 const THUMB_CMC := 1 const THUMB_MCP := 2 const THUMB_IP := 3 const THUMB_TIP := 4 const INDEX_FINGER_MCP := 5 const INDEX_FINGER_PIP := 6 const INDEX_FINGER_DIP := 7 const INDEX_FINGER_TIP := 8 const MIDDLE_FINGER_MCP := 9 const MIDDLE_FINGER_PIP := 10 const MIDDLE_FINGER_DIP := 11 const MIDDLE_FINGER_TIP := 12 const RING_FINGER_MCP := 13 const RING_FINGER_PIP := 14 const RING_FINGER_DIP := 15 const RING_FINGER_TIP := 16 const PINKY_MCP := 17 const PINKY_PIP := 18 const PINKY_DIP := 19 const PINKY_TIP := 20 ## Calculate the hand rotation from the hand tracking's world landmarks. func get_hand_rotation(side: String, landmarks: Array[Vector3]) -> Basis: var knuckles_center := (landmarks[INDEX_FINGER_MCP] + landmarks[RING_FINGER_TIP]) / 2 var wrist_to_knuckles := landmarks[WRIST].direction_to(knuckles_center) var towards_thumb := landmarks[RING_FINGER_TIP].direction_to(landmarks[INDEX_FINGER_MCP]) var palm_forward: Vector3 if side == "left": palm_forward = towards_thumb.cross(wrist_to_knuckles) if side == "right": palm_forward = wrist_to_knuckles.cross(towards_thumb) return Basis.looking_at(palm_forward, wrist_to_knuckles) ## Attempt to figure out the hand origin in viewspace. ## `hand_to_head_scale` is a fudge value so that we can attempt ## to force the hand and head into the same scale range, roughly. func get_hand_viewspace_origin( image_landmarks: Array[Vector3], _world_landmarks: Array[Vector3], # unused hand_to_head_scale: float, ) -> Vector3: # Values found through experimentation. var known_distances := [ [ WRIST , THUMB_CMC , 0.053861 ], [ THUMB_CMC , THUMB_MCP , 0.057096 ], [ THUMB_MCP , THUMB_IP , 0.048795 ], [ THUMB_IP , THUMB_TIP , 0.039851 ], [ WRIST , INDEX_FINGER_MCP , 0.152538 ], [ WRIST , RING_FINGER_TIP , 0.138711 ], [ INDEX_FINGER_MCP , MIDDLE_FINGER_MCP , 0.029368 ], [ MIDDLE_FINGER_MCP , MIDDLE_FINGER_TIP , 0.027699 ], [ MIDDLE_FINGER_TIP , RING_FINGER_TIP , 0.032673 ], ] # FIXME: Hardcoded fudge-factor for d in known_distances: d[2] *= 0.25 # Iterate through known distances and add up the weighted average. var fake_z_avg := 0.0 var total_avg_weight := 0.0 for d in known_distances: var pt0 := image_landmarks[d[0]] var pt1 := image_landmarks[d[1]] # Figure out a weighted average based on how much the vector # is facing the camera Z axis. Stuff facing into the camera # has less accurate results, so weight it lower. var normvec := (pt0 - pt1).normalized() var weight := clampf(1.0 - 2.0 * abs(normvec[2]), 0.0, 1.0) # Add to the average. fake_z_avg += guess_depth_from_known_distance( pt0, pt1, d[2] / hand_to_head_scale) * weight total_avg_weight += weight if abs(total_avg_weight) < 0.000001: print("HEY THE THING HAPPENED", total_avg_weight) # FIXME: Fudge value because I'm tired of this thing throwing # exceptions all the time. Do an actual fix later. total_avg_weight = 0.01 # Finish the average. fake_z_avg = fake_z_avg / total_avg_weight return ndc_to_viewspace(image_landmarks[0], -fake_z_avg) ## Figure out a depth value based on the distance between known ## normalized (clip-space) coordinates of landmarks, compared to what ## we would expect the average distance between those points to be. func guess_depth_from_known_distance(left: Vector3, right: Vector3, distance: float) -> float: var dist_clip := left - right dist_clip.x *= camera_aspect_ratio # FIXME: Fudge factor return 1.0 / (dist_clip.length() / distance) func ndc_to_viewspace(v: Vector3, z_offset: float) -> Vector3: # This (px, py) is pretty important and Google's # documentation didn't give much useful info about it. var px := 0.5 var py := 0.5 # These default to 1.0, 1.0 according to Google's docs. # I guess that's probably fine for default camera stuff. var fx := 1.0 var fy := camera_aspect_ratio # Inverse equation from the section on NDC space here # https://google.github.io/mediapipe/solutions/objectron.html#coordinate-systems # https://web.archive.org/web/20220727063132/https://google.github.io/mediapipe/solutions/objectron.html#coordinate-systems # which describes going from camera coordinates to NDC space. It's kinda # ambiguous on terms, but this seems to work to get view space coordinates. # With this, coordinates seem to be evenly scaled (between x/y and z) and in view space. var z_scale := 1.0 var z := 1.0 / (-v[2] + (1.0 / z_offset) * z_scale) var x := (v[0] - px) * z / fx var y := (v[1] - py) * z / fy return Vector3(x, y, z) ## Smoothly interpolates transforms in a framerate-independent way. ## For example, using a factor of 0.2, will move roughly 80% of the remaining distance in a second. func fi_slerp(value: Transform3D, target: Transform3D, factor: float, delta: float) -> Transform3D: return value.interpolate_with(target, 1 - factor ** delta)