class_name copyMediaPipe
extends Mod_Base


# FIXME: Best to get this from the tracker process (if possible).
var camera_aspect_ratio := 4.0 / 3.0 # Logitech C920 default?

@onready var tracker_head : Node3D = $TrackingRoot/Head
@onready var tracker_hand_left : Node3D = $TrackingRoot/LeftHand
@onready var tracker_hand_right : Node3D = $TrackingRoot/RightHand
@onready var landmark_template : MeshInstance3D = $TrackingRoot/LandmarkTemplate
@onready var landmarks_hand_left : Array[MeshInstance3D] = []
@onready var landmarks_hand_right : Array[MeshInstance3D] = []

@onready var hands := {
	left = {
		tracker = tracker_hand_left,
		landmarks = landmarks_hand_left,
	},
	right = {
		tracker = tracker_hand_right,
		landmarks = landmarks_hand_right,
	},
}


func _ready() -> void:
	setup_hand_landmarks()

	var dir = get_script().get_path().get_base_dir()
	var path = dir.path_join("_tracker/Project/new_new_tracker.py")
	python_process = KiriPythonWrapperInstance.new(path)
	if not python_process.setup_python(false):
		OS.alert("Failed to set up tracker dependencies!")

	start_process()
	# FIXME: Don't hardcode the video device; let the user pick one.
	var devices := get_video_devices()
	if devices.is_empty():
		OS.alert("No video devices found!")
		return
	set_video_device(devices[0])
	start_tracker()


func _exit_tree() -> void:
	stop_tracker()
	stop_process()


# Called after mod is initialized or model is changed.
func scene_init():
	pass


# Called before mod is removed, model is changed or application is shut down.
func scene_shutdown():
	pass


func _process(_delta: float) -> void:
	if is_tracker_running():
		receive_tracker_packets()


func setup_hand_landmarks() -> void:
	for side in hands:
		var hand = hands[side]
		# MediaPipe reports 21 landmarks per hand.
		for i in 21:
			var landmark: MeshInstance3D = landmark_template.duplicate(0)
			landmark.position = Vector3.ZERO
			landmark.visible = true
			hand.tracker.add_child(landmark)
			hand.landmarks.append(landmark)


# -----------------------------------------------------------------------------
# Functions to start/stop the PYTHON TRACKER PROCESS and communicate with it.
# -----------------------------------------------------------------------------

var python_process: KiriPythonWrapperInstance


func start_process() -> void:
	python_process.start_process(false)


func stop_process() -> void:
	python_process.stop_process()


func is_process_running() -> bool:
	return python_process.get_status() == KiriPythonWrapperInstance.KiriPythonWrapperStatus.STATUS_RUNNING


# Returns [{ name: String, backend: String, path: String, index: int }].
func get_video_devices() -> Array:
	assert(is_process_running())
	var devices = python_process.call_rpc_sync("enumerate_camera_devices", [])
	return devices if devices is Array else []


func set_video_device(device) -> void:
	assert(is_process_running())
	var index: int = device.index if device else -1
	python_process.call_rpc_sync("set_video_device_number", [ index ])


# -----------------------------------------------------------------------------
# Functions to start/stop the TRACKER and receive packets coming from it.
# -----------------------------------------------------------------------------

var base_port := 7098
var udp_server: PacketPeerUDP
var udp_server_port: int


func start_tracker() -> void:
	assert(!is_tracker_running())

	udp_server = PacketPeerUDP.new()
	# Find an open port number to use.
	udp_server_port = base_port
	while udp_server.bind(udp_server_port, "127.0.0.1") != OK:
		udp_server_port += 1

	python_process.call_rpc_sync("set_udp_port_number", [ udp_server_port ])
	python_process.call_rpc_sync("start_tracker", [])


func stop_tracker() -> void:
	if !is_tracker_running(): return # Do nothing if the tracker isn't running.
	python_process.call_rpc_sync("stop_tracker", [])
	udp_server.close()
	udp_server = null


func is_tracker_running() -> bool:
	return udp_server != null


func receive_tracker_packets() -> void:
	assert(is_tracker_running())
	# Drain every packet that arrived since the last frame.
	while true:
		var bytes := udp_server.get_packet()
		if bytes.size() == 0: break
		var data = JSON.parse_string(bytes.get_string_from_utf8())
		if data is Dictionary: process_tracker_data(data)


# -----------------------------------------------------------------------------
# Functions to PROCESS the incoming TRACKER DATA, and update tracker objects.
# -----------------------------------------------------------------------------

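# Incoming packets are JSON dictionaries. Judging from how the fields are
# consumed below, a tracking packet looks roughly like this (a sketch
# inferred from usage, not a spec):
#
#   { "face":  { "transform": <row-major 3x4 matrix as nested arrays> },
#     "hands": { "left":  { "confidence": float,
#                           "image_landmarks": [[x, y, z], ...],
#                           "world_landmarks": [[x, y, z], ...] },
#                "right": { ... same shape ... } } }
#
# The tracker can also send bare { "status": String } or { "error": String }
# messages, which are handled first.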
func process_tracker_data(data: Dictionary) -> void:
	if "error" in data: on_tracker_error(data.error); return
	if "status" in data: on_tracker_status(data.status); return
	convert_tracker_data(data)

	# MediaPipe reports handedness from the viewer's perspective, not the
	# person's own actual left and right hand, so swap them here.
	var left = data["hands"]["left"]
	var right = data["hands"]["right"]
	data["hands"]["left"] = right
	data["hands"]["right"] = left

	tracker_head.transform = data["face"]["transform"]
	tracker_head.position /= 100 # Centimeters to meters.

	# TODO: Actually use this.
	var num_hands_detected := 0
	for side in hands:
		var hand = hands[side]
		var tracker: Node3D = hand.tracker

		# TODO: Don't automatically trust the handedness of the input data.
		var hand_data = data["hands"][side]
		var image_landmarks: Array[Vector3] = hand_data["image_landmarks"]
		var world_landmarks: Array[Vector3] = hand_data["world_landmarks"]

		# FIXME: Make this configurable.
		var min_confidence_threshold := 0.85
		if hand_data["confidence"] < min_confidence_threshold: continue
		num_hands_detected += 1

		# Mirror positions on the X axis, since image landmarks are in view space.
		for i in image_landmarks.size(): image_landmarks[i].x = (1 - image_landmarks[i].x)

		tracker.basis = get_hand_rotation(world_landmarks)
		tracker.position = get_hand_viewspace_origin(image_landmarks, world_landmarks, 2.0) \
			* Vector3(7.0, 7.0, 3.5) # FIXME: Fudge factor to match world space better.

		# Translate landmarks so the origin is at the wrist.
		var wrist_position := world_landmarks[0]
		# World landmarks are in world space, so we have to "subtract" the hand
		# rotation. The rotation is also flipped, so correct that here as well.
		var hand_rotation := tracker.basis.inverse() * Basis.from_euler(Vector3(TAU / 2, 0, 0))
		for i in world_landmarks.size():
			var pos := world_landmarks[i] - wrist_position
			hand.landmarks[i].position = hand_rotation * pos

	# TODO: Interpolation needs to be done outside of this function,
	# as it could be called multiple times a frame, or not at all.

	# Smoothly interpolate tracker transforms (in a framerate-independent way).
	# var f := 0.0000000001 # Yes this value needs to be THAT small.
	# tracker_head.transform = tracker_head.transform.interpolate_with(head_transform, 1 - f ** delta)
	# tracker_hand_left.transform = tracker_hand_left.transform.interpolate_with(hand_left_transform, 1 - f ** delta)
	# tracker_hand_right.transform = tracker_hand_right.transform.interpolate_with(hand_right_transform, 1 - f ** delta)
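	# Why `1 - f ** delta` is framerate-independent: `f` is the fraction of
	# the remaining distance still left after one full second of smoothing,
	# so after time `delta` the remaining fraction is `f ** delta`, and the
	# amount to blend this frame is `1 - f ** delta`. Two frames of `delta`
	# blend exactly as much as one frame of `2 * delta`.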


func on_tracker_status(status: String) -> void:
	set_status(status)


func on_tracker_error(error: String) -> void:
	print_log("Error: " + error)


# -----------------------------------------------------------------------------
# Functions that deal with CONVERTING the TRACKER DATA to Godot types.
# -----------------------------------------------------------------------------

## Converts the arrays inside `data` to known data types like Vector3 and Transform3D.
func convert_tracker_data(data: Dictionary) -> void:
	data["face"]["transform"] = to_transform(data["face"]["transform"])
	for side in data["hands"]:
		var hand = data["hands"][side]
		# Convert untyped arrays of arrays to typed Array[Vector3].
		var image_landmarks = hand["image_landmarks"].map(to_vector)
		var world_landmarks = hand["world_landmarks"].map(to_vector)
		hand["image_landmarks"] = Array(image_landmarks, TYPE_VECTOR3, "", null)
		hand["world_landmarks"] = Array(world_landmarks, TYPE_VECTOR3, "", null)


func to_vector(array) -> Vector3:
	return Vector3(array[0], array[1], array[2])


# `matrix` is a row-major array of rows: columns 0-2 hold the basis axes,
# column 3 holds the origin.
func to_transform(matrix) -> Transform3D:
	return Transform3D(
		Basis(Vector3(matrix[0][0], matrix[1][0], matrix[2][0]),
			Vector3(matrix[0][1], matrix[1][1], matrix[2][1]),
			Vector3(matrix[0][2], matrix[1][2], matrix[2][2])),
		Vector3(matrix[0][3], matrix[1][3], matrix[2][3]))


# -----------------------------------------------------------------------------
# MediaPipe hand landmark indices.
# -----------------------------------------------------------------------------

const WRIST := 0
const THUMB_CMC := 1
const THUMB_MCP := 2
const THUMB_IP := 3
const THUMB_TIP := 4
const INDEX_FINGER_MCP := 5
const INDEX_FINGER_PIP := 6
const INDEX_FINGER_DIP := 7
const INDEX_FINGER_TIP := 8
const MIDDLE_FINGER_MCP := 9
const MIDDLE_FINGER_PIP := 10
const MIDDLE_FINGER_DIP := 11
const MIDDLE_FINGER_TIP := 12
const RING_FINGER_MCP := 13
const RING_FINGER_PIP := 14
const RING_FINGER_DIP := 15
const RING_FINGER_TIP := 16
const PINKY_MCP := 17
const PINKY_PIP := 18
const PINKY_DIP := 19
const PINKY_TIP := 20


# FIXME: I changed the way this was calculated and it doesn't quite fit the data right?
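# The idea: a forward vector from the wrist towards the knuckles, plus a
# second vector across the palm towards the thumb side; their cross product
# gives an up vector, and Basis.looking_at() builds the orientation from
# those two axes.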
func get_hand_rotation(landmarks: Array[Vector3]) -> Basis:
	var knuckles_center := (landmarks[INDEX_FINGER_MCP] + landmarks[RING_FINGER_TIP]) / 2
	var wrist_to_knuckles := landmarks[WRIST].direction_to(knuckles_center)
	var towards_thumb := landmarks[RING_FINGER_TIP].direction_to(landmarks[INDEX_FINGER_MCP])

	var up := wrist_to_knuckles.cross(towards_thumb)
	return Basis.looking_at(wrist_to_knuckles, up, true)


## Attempt to figure out the hand origin in view space.
## `hand_to_head_scale` is a fudge value so that we can attempt
## to force the hand and head into the same scale range, roughly.
func get_hand_viewspace_origin(
	image_landmarks: Array[Vector3],
	_world_landmarks: Array[Vector3],
	hand_to_head_scale: float,
) -> Vector3:
	# Values found through experimentation.
	var known_distances := [
		[ WRIST,             THUMB_CMC,         0.053861 ],
		[ THUMB_CMC,         THUMB_MCP,         0.057096 ],
		[ THUMB_MCP,         THUMB_IP,          0.048795 ],
		[ THUMB_IP,          THUMB_TIP,         0.039851 ],
		[ WRIST,             INDEX_FINGER_MCP,  0.152538 ],
		[ WRIST,             RING_FINGER_TIP,   0.138711 ],
		[ INDEX_FINGER_MCP,  MIDDLE_FINGER_MCP, 0.029368 ],
		[ MIDDLE_FINGER_MCP, MIDDLE_FINGER_TIP, 0.027699 ],
		[ MIDDLE_FINGER_TIP, RING_FINGER_TIP,   0.032673 ],
	]
	# FIXME: Hardcoded fudge factor.
	for d in known_distances: d[2] *= 0.25

	# Iterate through the known distances and add up the weighted average.
	var fake_z_avg := 0.0
	var total_avg_weight := 0.0
	for d in known_distances:
		var pt0 := image_landmarks[d[0]]
		var pt1 := image_landmarks[d[1]]

		# Weight each sample by how much the vector faces the camera
		# Z axis. Vectors pointing into the camera give less accurate
		# results, so weight them lower.
		var normvec := (pt0 - pt1).normalized()
		var weight := clampf(1.0 - 2.0 * abs(normvec[2]), 0.0, 1.0)

		# Add to the average.
		fake_z_avg += guess_depth_from_known_distance(
			pt0, pt1, d[2] / hand_to_head_scale) * weight
		total_avg_weight += weight

	if abs(total_avg_weight) < 0.000001:
		print("Warning: total landmark weight is nearly zero: ", total_avg_weight)
		# FIXME: Fudge value to avoid a division by zero below.
		# Do an actual fix later.
		total_avg_weight = 0.01

	# Finish the average.
	fake_z_avg = fake_z_avg / total_avg_weight

	return ndc_to_viewspace(image_landmarks[0], -fake_z_avg)


## Figure out a depth value based on the distance between known
## normalized (clip-space) coordinates of landmarks, compared to what
## we would expect the average distance between those points to be.
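##
## Under a simple pinhole model, the projected distance between two points
## scales with 1/z, so for a pair of landmarks a known real distance apart:
##
##   apparent_distance ~= real_distance / z   =>   z ~= real_distance / apparent_distance
##
## This is a sketch of the reasoning (assuming a roughly unit focal length);
## it matches the ratio the return expression below computes.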
func guess_depth_from_known_distance(left: Vector3, right: Vector3, distance: float) -> float:
	var dist_clip := left - right
	dist_clip.x *= camera_aspect_ratio # FIXME: Fudge factor.
	return 1.0 / (dist_clip.length() / distance)


func ndc_to_viewspace(v: Vector3, z_offset: float) -> Vector3:
	# This (px, py) is pretty important and Google's
	# documentation didn't give much useful info about it.
	var px := 0.5
	var py := 0.5

	# These default to 1.0, 1.0 according to Google's docs.
	# I guess that's probably fine for default camera stuff.
	var fx := 1.0
	var fy := camera_aspect_ratio

	# Inverse equation from the section on NDC space here:
	# https://google.github.io/mediapipe/solutions/objectron.html#coordinate-systems
	# https://web.archive.org/web/20220727063132/https://google.github.io/mediapipe/solutions/objectron.html#coordinate-systems
	# which describes going from camera coordinates to NDC space. It's kinda
	# ambiguous on terms, but this seems to work to get view space coordinates.
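	# The forward mapping given there is roughly (sign conventions may differ):
	#
	#   x_ndc = x * fx / z + px
	#   y_ndc = y * fy / z + py
	#   z_ndc = 1 / z
	#
	# Inverting it (with z_offset and z_scale mixed in below as fudge terms):
	#
	#   z = 1 / z_ndc
	#   x = (x_ndc - px) * z / fx
	#   y = (y_ndc - py) * z / fy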
	# With this, coordinates seem to be evenly scaled (between x/y and z) and in view space.
	var z_scale := 1.0
	var z := 1.0 / (-v[2] + (1.0 / z_offset) * z_scale)
	var x := (v[0] - px) * z / fx
	var y := (v[1] - py) * z / fy
	return Vector3(x, y, z)