动作捕捉技术:让数字人动起来的核心技能
这篇文章帮你理解动作捕捉
数字人除了要有好看的脸,还要会动才行。动作捕捉就是把你的动作“复制”到数字人身上的技术。读完这篇,你会明白动作捕捉是怎么工作的,以及怎么用各种工具实现从摄像头捕捉动作来驱动数字人。
先搞清楚:什么是动作捕捉?
你有没有看过那种很炫酷的幕后花絮?演员穿着满身是球的紧身衣,在绿幕前做各种动作。然后后期人员把这些动作“绑定”到电脑上制作出的角色身上,让角色做出跟演员一样的动作。这就是动作捕捉(Motion Capture,简称MoCap)。
简单来说,动作捕捉就是:记录真人的动作,然后把这些动作“转移”到数字人身上。
动作捕捉有什么用?
- 游戏动画:你看到游戏角色的动作,很多都是真人动捕后绑定的
- 电影特效:阿凡达、猩球崛起等电影的数字角色
- 虚拟主播:VTuber的动作就是通过动捕实时驱动数字人
- 体育分析:捕捉运动员的动作来做技术分析
- 医疗康复:捕捉患者动作来做康复评估
动作捕捉的类型
| 类型 | 原理 | 优点 | 缺点 | 成本 |
|---|---|---|---|---|
| 光学动捕 | 用多台相机追踪反光球 | 精度极高 | 需要专业设备、暗室 | 10万+ |
| 惯性动捕 | IMU传感器记录身体姿态 | 便携、可户外 | 精度一般、需穿戴 | 1-5万 |
| Markerless | AI识别视频中的人体 | 无需穿戴 | 精度较差 | 免费-低 |
| 纯AI动捕 | 用AI从单目视频估计姿态 | 最简单 | 精度最差 | 免费 |
作为个人开发者或小型团队,我们主要关注Markerless和纯AI动捕,因为其他方案成本太高。
1. MediaPipe:Google开源的AI动作捕捉
1.1 MediaPipe是什么?
MediaPipe是Google开源的跨平台机器学习框架,其中最常用的就是人体姿态估计(Pose Estimation)和手部追踪(Hand Tracking)。
它可以:
- 从摄像头实时检测人体的关键点位置
- 检测手部21个关键点
- 检测面部468个特征点
- 完全免费,而且精度还不错
1.2 安装MediaPipe
# 基础安装
pip install mediapipe
# 注意:PyPI 上的 mediapipe 包没有名为 gpu 的 extra,下面这条命令与基础安装等价
# (pip 只会给出一条“未提供该 extra”的警告);GPU 支持取决于平台和所用的 API
pip install mediapipe[gpu]
# 或者用opencv-python读取视频
pip install opencv-python
1.3 MediaPipe基础使用:人体姿态检测
"""
MediaPipe人体姿态检测基础示例
"""
import cv2
import mediapipe as mp
import numpy as np
# MediaPipe solution modules used by this example: the pose estimator plus the
# drawing helpers that render the detected skeleton.
mp_pose = mp.solutions.pose
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles

# Shared pose estimator, configured for a continuous video stream.
pose = mp_pose.Pose(
    static_image_mode=False,       # treat input as a video stream
    model_complexity=2,            # 0/1/2: higher is more accurate but slower
    smooth_landmarks=True,         # filter landmarks across frames to reduce jitter
    enable_segmentation=False,     # no person segmentation mask
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5,
)
def detect_pose(image):
    """Run pose estimation on one frame and draw the skeleton onto it.

    Args:
        image: Frame in BGR channel order; it is drawn on in place.

    Returns:
        Tuple ``(image, pose_landmarks)`` where ``pose_landmarks`` is the
        ``pose_landmarks`` attribute of the MediaPipe result (falsy when no
        pose was found).
    """
    # MediaPipe expects RGB input while OpenCV delivers BGR.
    results = pose.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    detected = results.pose_landmarks
    if not detected:
        return image, detected

    # Draw both the key points and the connecting bones.
    mp_drawing.draw_landmarks(
        image,
        detected,
        mp_pose.POSE_CONNECTIONS,
        landmark_drawing_spec=mp_drawing_styles.get_default_pose_landmarks_style(),
    )
    return image, detected
# Live pose detection from the webcam
def webcam_demo():
    """Show live pose detection from the default webcam until ESC is pressed.

    Each frame is mirrored, passed through ``detect_pose`` and displayed; when
    a pose is found the normalized nose coordinates are printed.
    """
    cap = cv2.VideoCapture(0)  # 0 selects the default camera
    try:
        while cap.isOpened():
            success, image = cap.read()
            if not success:
                print("无法读取摄像头")
                break

            # Mirror horizontally so the preview behaves like a selfie view.
            image = cv2.flip(image, 1)
            image, landmarks = detect_pose(image)

            if landmarks:
                # ``landmarks`` is MediaPipe's landmark-list message; the
                # individual points live in its ``landmark`` field, the
                # message itself cannot be indexed.
                nose = landmarks.landmark[mp_pose.PoseLandmark.NOSE]
                print(f"鼻子位置: x={nose.x:.3f}, y={nose.y:.3f}")

            cv2.imshow('MediaPipe Pose', image)
            if cv2.waitKey(5) & 0xFF == 27:  # 27 is the ESC key code
                break
    finally:
        # Release the camera and close the window even if a frame fails.
        cap.release()
        cv2.destroyAllWindows()
if __name__ == "__main__":
    webcam_demo()
1.4 提取骨架数据并驱动数字人
"""
从MediaPipe提取骨架数据
"""
import cv2
import mediapipe as mp
import numpy as np
import json
class SkeletonExtractor:
    """Extract named pose landmarks from frames or videos with MediaPipe Pose."""

    # Names of MediaPipe's 33 pose landmarks, listed in landmark-index order.
    LANDMARK_NAMES = [
        'nose', 'left_eye_inner', 'left_eye', 'left_eye_outer',
        'right_eye_inner', 'right_eye', 'right_eye_outer',
        'left_ear', 'right_ear', 'mouth_left', 'mouth_right',
        'left_shoulder', 'right_shoulder', 'left_elbow', 'right_elbow',
        'left_wrist', 'right_wrist', 'left_pinky', 'right_pinky',
        'left_index', 'right_index', 'left_thumb', 'right_thumb',
        'left_hip', 'right_hip', 'left_knee', 'right_knee',
        'left_ankle', 'right_ankle', 'left_heel', 'right_heel',
        'left_foot_index', 'right_foot_index'
    ]

    # Joint name -> (first, vertex, last) landmark names; the reported angle
    # is the one measured at the vertex.
    _ANGLE_TRIPLES = {
        'left_elbow': ('left_shoulder', 'left_elbow', 'left_wrist'),
        'right_elbow': ('right_shoulder', 'right_elbow', 'right_wrist'),
        'left_knee': ('left_hip', 'left_knee', 'left_ankle'),
        'right_knee': ('right_hip', 'right_knee', 'right_ankle'),
    }

    def __init__(self):
        self.mp_pose = mp.solutions.pose
        self.pose = self.mp_pose.Pose(
            static_image_mode=False,
            model_complexity=2,
            smooth_landmarks=True
        )
        # Bone connections (pairs of landmark names) used when binding the
        # skeleton to a digital human.
        self.bone_connections = [
            # Torso
            ('left_shoulder', 'right_shoulder'),
            ('left_shoulder', 'left_hip'),
            ('right_shoulder', 'right_hip'),
            ('left_hip', 'right_hip'),
            # Arms
            ('left_shoulder', 'left_elbow'),
            ('left_elbow', 'left_wrist'),
            ('right_shoulder', 'right_elbow'),
            ('right_elbow', 'right_wrist'),
            # Legs
            ('left_hip', 'left_knee'),
            ('left_knee', 'left_ankle'),
            ('right_hip', 'right_knee'),
            ('right_knee', 'right_ankle'),
        ]

    def extract_from_frame(self, frame):
        """Extract the skeleton from a single frame.

        Args:
            frame: Image in BGR channel order.

        Returns:
            Dict mapping landmark name to ``{'x', 'y', 'z', 'visibility'}``,
            or ``None`` when no pose was detected.
        """
        image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = self.pose.process(image_rgb)
        if not results.pose_landmarks:
            return None

        # zip() pairs names with landmarks by index and cannot run past the
        # end of LANDMARK_NAMES.
        return {
            name: {
                'x': landmark.x,
                'y': landmark.y,
                'z': landmark.z,
                'visibility': landmark.visibility,  # confidence value
            }
            for name, landmark in zip(
                self.LANDMARK_NAMES, results.pose_landmarks.landmark
            )
        }

    def extract_from_video(self, video_path, output_path=None):
        """Extract the skeleton from every frame of a video file.

        Args:
            video_path: Path of the video file.
            output_path: Optional path; when given the result is also written
                there as JSON.

        Returns:
            List with one ``{'frame', 'timestamp', 'landmarks'}`` dict per
            frame in which a pose was detected. ``timestamp`` is in seconds,
            or ``None`` when the container reports no usable frame rate.
        """
        cap = cv2.VideoCapture(video_path)
        skeleton_data = []
        try:
            fps = cap.get(cv2.CAP_PROP_FPS)
            # A missing/unsupported FPS property is reported as 0; dividing
            # by it would raise, so timestamps are left undefined instead.
            has_fps = fps > 0
            frame_count = 0
            while cap.isOpened():
                success, frame = cap.read()
                if not success:
                    break
                landmarks = self.extract_from_frame(frame)
                if landmarks:
                    skeleton_data.append({
                        'frame': frame_count,
                        'timestamp': frame_count / fps if has_fps else None,
                        'landmarks': landmarks
                    })
                frame_count += 1
        finally:
            # Release the capture even when decoding or detection fails.
            cap.release()

        if output_path:
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(skeleton_data, f, indent=2)
        return skeleton_data

    @staticmethod
    def _angle_at(p1, p2, p3):
        """Return the 2D angle in degrees at ``p2``, or None if undefined."""
        v1 = np.array([p1['x'] - p2['x'], p1['y'] - p2['y']])
        v2 = np.array([p3['x'] - p2['x'], p3['y'] - p2['y']])
        scale = np.linalg.norm(v1) * np.linalg.norm(v2)
        if scale == 0:
            # Two of the points coincide: the angle is undefined (dividing
            # here would produce NaN).
            return None
        cos_angle = np.dot(v1, v2) / scale
        # Clip guards against rounding pushing the cosine outside [-1, 1].
        return float(np.degrees(np.arccos(np.clip(cos_angle, -1, 1))))

    def landmarks_to_bone_angles(self, landmarks):
        """Compute elbow and knee angles from landmark positions.

        Only the ``x``/``y`` coordinates are used, so the angles are measured
        in the image plane.

        Args:
            landmarks: Dict mapping landmark name to a dict with ``x``/``y``.

        Returns:
            Dict mapping joint name (``left_elbow``, ``right_elbow``,
            ``left_knee``, ``right_knee``) to its angle in degrees. A joint is
            omitted when one of its landmarks is missing or its angle is
            undefined.
        """
        angles = {}
        for joint, names in self._ANGLE_TRIPLES.items():
            if not all(name in landmarks for name in names):
                continue
            angle = self._angle_at(*(landmarks[name] for name in names))
            if angle is not None:
                angles[joint] = angle
        return angles
# Usage example
if __name__ == "__main__":
    extractor = SkeletonExtractor()

    # Extract from a video file instead:
    # skeleton_data = extractor.extract_from_video(
    #     'input_video.mp4',
    #     'skeleton_data.json'
    # )

    # Live processing from the default camera
    cap = cv2.VideoCapture(0)
    try:
        while cap.isOpened():
            success, frame = cap.read()
            if not success:
                break
            frame = cv2.flip(frame, 1)

            landmarks = extractor.extract_from_frame(frame)
            if landmarks:
                angles = extractor.landmarks_to_bone_angles(landmarks)
                # cv2.putText only renders ASCII with the Hershey fonts (other
                # characters come out as '?'), so spell out the unit instead
                # of using the degree sign.
                angle_text = [
                    f"{name}: {angle:.1f} deg" for name, angle in angles.items()
                ]
                if angle_text:
                    cv2.putText(
                        frame,
                        ' | '.join(angle_text[:4]),  # show at most four angles
                        (10, 30),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        0.7,
                        (0, 255, 0),
                        2
                    )

            cv2.imshow('Skeleton', frame)
            if cv2.waitKey(5) & 0xFF == 27:  # ESC quits
                break
    finally:
        # Release the camera even if processing a frame raises.
        cap.release()
    cv2.destroyAllWindows()
1.5 MediaPipe手部追踪
"""
MediaPipe手部追踪
"""
import cv2
import mediapipe as mp
import numpy as np
class HandTracker:
    """Track hands with MediaPipe Hands (21 key points per hand)."""

    # (tip index, joint-below-tip index) for thumb, index, middle, ring and
    # pinky, in that order.
    _FINGER_JOINTS = ((4, 3), (8, 6), (12, 10), (16, 14), (20, 18))

    # Extended-finger pattern (thumb, index, middle, ring, pinky) -> gesture.
    _GESTURES = {
        (False, False, False, False, False): "Fist",
        (True, True, True, True, True): "Open",
        (False, True, True, False, False): "Peace",
        (True, False, False, False, False): "Like",
        (False, True, False, False, False): "Point",
        (False, True, True, True, False): "Three",
    }

    def __init__(self, max_hands=2):
        self.mp_hands = mp.solutions.hands
        self.hands = self.mp_hands.Hands(
            static_image_mode=False,
            max_num_hands=max_hands,
            min_detection_confidence=0.5,
            min_tracking_confidence=0.5
        )
        self.mp_draw = mp.solutions.drawing_utils

    def get_hand_landmarks(self, frame):
        """Detect hands in a BGR frame and draw them onto it.

        Args:
            frame: Image in BGR channel order; it is drawn on in place.

        Returns:
            Tuple ``(frame, hands_data)``; ``hands_data`` holds one
            ``{'handedness', 'landmarks'}`` dict per detected hand, where
            ``landmarks`` is a list of ``{'x', 'y', 'z'}`` dicts.
        """
        image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = self.hands.process(image_rgb)

        hands_data = []
        if not results.multi_hand_landmarks:
            return frame, hands_data

        # The two result lists are parallel, so pair them directly instead of
        # searching for each hand's position with list.index().
        for hand_landmarks, handedness in zip(
            results.multi_hand_landmarks, results.multi_handedness
        ):
            self.mp_draw.draw_landmarks(
                frame,
                hand_landmarks,
                self.mp_hands.HAND_CONNECTIONS
            )
            hands_data.append({
                'handedness': handedness.classification[0].label,
                'landmarks': [
                    {'x': lm.x, 'y': lm.y, 'z': lm.z}
                    for lm in hand_landmarks.landmark
                ],
            })
        return frame, hands_data

    def recognize_gesture(self, landmarks):
        """Classify a simple static gesture from 21 hand landmarks.

        A finger counts as extended when its tip is above (smaller ``y`` than)
        the joint below it, so the result is only meaningful for an upright
        hand. The thumb uses the same vertical test.

        Args:
            landmarks: List of dicts with a ``y`` key, indexed like the
                MediaPipe hand landmarks.

        Returns:
            One of "Fist", "Open", "Peace", "Like", "Point", "Three",
            "Other", or "None" when fewer than 21 landmarks are given.
        """
        # An incomplete hand cannot be classified; indexing it would raise.
        if not landmarks or len(landmarks) < 21:
            return "None"

        fingers = tuple(
            landmarks[tip]['y'] < landmarks[joint]['y']  # smaller y is higher
            for tip, joint in self._FINGER_JOINTS
        )
        return self._GESTURES.get(fingers, "Other")
def hand_tracker_demo():
    """Show live hand tracking with a gesture label next to each wrist.

    Runs on the default camera until the stream ends or ESC is pressed.
    """
    tracker = HandTracker(max_hands=2)
    camera = cv2.VideoCapture(0)

    while camera.isOpened():
        ok, frame = camera.read()
        if not ok:
            break

        # Mirror the frame, then detect and draw the hands.
        frame, hands = tracker.get_hand_landmarks(cv2.flip(frame, 1))

        for hand in hands:
            points = hand['landmarks']
            gesture = tracker.recognize_gesture(points)
            wrist = points[0]  # landmark 0 is the wrist
            # Landmarks are normalized, so scale them to pixel coordinates.
            anchor = (
                int(wrist['x'] * frame.shape[1]),
                int(wrist['y'] * frame.shape[0]),
            )
            cv2.putText(
                frame,
                f"{hand['handedness']}: {gesture}",
                anchor,
                cv2.FONT_HERSHEY_SIMPLEX,
                1,
                (0, 255, 0),
                2,
            )

        cv2.imshow('Hand Tracking', frame)
        if cv2.waitKey(5) & 0xFF == 27:  # ESC quits
            break

    camera.release()
    cv2.destroyAllWindows()
if __name__ == "__main__":
    hand_tracker_demo()
2. OpenPose:更强大的开源方案
2.1 OpenPose简介
OpenPose是卡内基梅隆大学开源的姿态检测框架,比MediaPipe更专业,但安装也更复杂。
主要功能:
- 身体关键点检测(25个点)
- 手部关键点检测(21个点)
- 面部关键点检测(70个点)
- 支持多人检测
2.2 OpenPose安装(Ubuntu/Mac)
# 安装依赖
sudo apt-get install libopencv-dev python3-opencv
# 克隆仓库
git clone https://github.com/CMU-Perceptual-Computing-Lab/openpose.git
cd openpose
# 使用Docker(推荐,更简单)
# (原文此处的镜像名已损坏;请替换为你实际使用的 OpenPose 镜像)
docker pull <openpose镜像名>:latest
# 或者用Python版本(功能较少但安装简单)
pip install openpifpaf
2.3 OpenPose快速使用
# 摄像头实时检测
./build/examples/openpose/openpose.bin --camera 0
# 从视频文件检测
./build/examples/openpose/openpose.bin --video input_video.mp4
# 保存结果
./build/examples/openpose/openpose.bin \
--video input_video.mp4 \
--write_json output_json/ \
--write_video output_video.avi
3. 表情捕捉:让数字人有表情
3.1 表情捕捉是什么?
光有身体动作还不够,数字人还要有表情才能“活”起来。表情捕捉就是检测人的面部表情,然后驱动数字人的表情系统。
3.2 ARKit表情捕捉(iOS设备)
如果你是iPhone或iPad用户,可以使用ARKit自带的Face Tracking功能:
import ARKit
import RealityKit
class FaceTracker: NSObject, ARSessionDelegate {
var session: ARSession!
/// Starts an ARKit face-tracking session if the device supports it.
///
/// Creates `session` on first use and registers this object as its delegate
/// so that `session(_:didUpdate:)` receives anchor updates.
func setupFaceTracking() {
    guard ARFaceTrackingConfiguration.isSupported else {
        print("设备不支持面部追踪")
        return
    }
    // `session` is implicitly unwrapped and nothing in this class assigns it;
    // create it here so `run` does not crash on nil. A session injected by the
    // caller is kept as-is.
    if session == nil {
        session = ARSession()
    }
    // Without a delegate the didUpdate callback is never invoked.
    session.delegate = self

    let configuration = ARFaceTrackingConfiguration()
    configuration.isLightEstimationEnabled = true
    session.run(configuration)
}
/// ARSessionDelegate callback: reads blend-shape coefficients from the first
/// face anchor in the update and prints a few of them.
func session(_ session: ARSession, didUpdate anchors: [ARAnchor]) {
    // Take the first face anchor wherever it sits in the batch; casting only
    // `anchors.first` drops the update when another anchor type comes first.
    guard let faceAnchor = anchors.compactMap({ $0 as? ARFaceAnchor }).first else { return }

    // Expression coefficients (52 blend shapes)
    let blendShapes = faceAnchor.blendShapes

    // A coefficient that is missing from the dictionary is treated as 0.
    func coefficient(_ location: ARFaceAnchor.BlendShapeLocation) -> Double {
        return blendShapes[location]?.doubleValue ?? 0
    }

    // Key expressions. Other locations such as .browDownLeft / .browDownRight
    // can be read the same way.
    let eyeBlinkLeft = coefficient(.eyeBlinkLeft)
    let eyeBlinkRight = coefficient(.eyeBlinkRight)
    let mouthSmileLeft = coefficient(.mouthSmileLeft)
    let mouthSmileRight = coefficient(.mouthSmileRight)
    let jawOpen = coefficient(.jawOpen)

    print("左眼眨眼: \(eyeBlinkLeft)")
    print("右眼眨眼: \(eyeBlinkRight)")
    print("嘴巴张开: \(jawOpen)")
    print("微笑: \((mouthSmileLeft + mouthSmileRight) / 2)")

    // The data can be forwarded to the digital-human system from here.
    // sendToDigitalHuman(...)
}
}
3.3 Python端接收ARKit数据
"""
使用Live Link Face接收iPhone的面部追踪数据
"""
import socket
import json
import numpy as np
class LiveLinkFaceReceiver:
    """Receive face blend-shape values over UDP.

    Each datagram is parsed as a UTF-8 JSON object whose ``blendShapes`` entry
    is an array of values; values are named by position using
    ``blend_shape_names``.

    NOTE(review): this assumes the sender emits JSON in that shape. Confirm
    that the app on the phone actually uses this format before relying on it.
    """

    def __init__(self, port=11111):
        self.port = port
        self.socket = None
        self.running = False
        # Positional names for incoming values.
        # NOTE(review): the order must match the sender; verify against it.
        self.blend_shape_names = [
            'browDown_L', 'browDown_R', 'browInnerUp',
            'browOuterUp_L', 'browOuterUp_R',
            'eyeLookUp_L', 'eyeLookUp_R',
            'eyeLookDown_L', 'eyeLookDown_R',
            'eyeLookIn_L', 'eyeLookIn_R',
            'eyeLookOut_L', 'eyeLookOut_R',
            'eyeBlink_L', 'eyeBlink_R',
            'eyeSquint_L', 'eyeSquint_R',
            'eyeWide_L', 'eyeWide_R',
            'cheekPuff', 'cheekSquint_L', 'cheekSquint_R',
            'noseSneer_L', 'noseSneer_R',
            'jawOpen', 'jawForward', 'jawLeft', 'jawRight',
            'jawClench',
            'mouthFunnel', 'mouthPucker',
            'mouthLeft', 'mouthRight',
            'mouthSmile_L', 'mouthSmile_R',
            'mouthFrown_L', 'mouthFrown_R',
            'mouthDimple_L', 'mouthDimple_R',
            'mouthUpperUp_L', 'mouthUpperUp_R',
            'mouthLowerDown_L', 'mouthLowerDown_R',
            'mouthPress_L', 'mouthPress_R',
            'mouthStretch_L', 'mouthStretch_R',
            'tongueOut'
        ]

    def start(self):
        """Bind the UDP socket on all interfaces and start receiving.

        Raises:
            OSError: If the socket cannot be bound (e.g. port in use).
        """
        sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        try:
            sock.bind(('', self.port))
            sock.settimeout(1.0)  # receive() gives up after one second
        except OSError:
            # Do not leak the descriptor when binding fails.
            sock.close()
            raise
        self.socket = sock
        self.running = True
        print(f"监听端口 {self.port}...")

    def receive(self):
        """Receive and parse one datagram.

        Returns:
            Dict mapping blend-shape name to value (empty when the message has
            no ``blendShapes`` entry), or ``None`` when the receiver is not
            running, the wait timed out, or the datagram could not be read or
            parsed.
        """
        if not self.running:
            return None

        try:
            data, _ = self.socket.recvfrom(4096)
        except socket.timeout:
            return None
        except OSError as e:
            print(f"接收错误: {e}")
            return None

        try:
            message = json.loads(data.decode('utf-8'))
        except ValueError as e:
            # Covers both invalid UTF-8 and invalid JSON.
            print(f"接收错误: {e}")
            return None

        return self._parse_message(message)

    def _parse_message(self, message):
        """Turn a decoded message into a name -> value dict (None if malformed)."""
        if not isinstance(message, dict):
            print(f"接收错误: unexpected message type {type(message).__name__}")
            return None
        values = message.get('blendShapes', [])
        if not isinstance(values, (list, tuple)):
            print("接收错误: blendShapes is not an array")
            return None

        names = self.blend_shape_names
        # Values beyond the known names get a positional fallback name.
        return {
            (names[i] if i < len(names) else f"bs_{i}"): value
            for i, value in enumerate(values)
        }

    def stop(self):
        """Stop receiving and close the socket; safe to call repeatedly."""
        self.running = False
        if self.socket:
            self.socket.close()
            self.socket = None
# Usage example: print a few blend-shape values until interrupted (Ctrl+C).
if __name__ == "__main__":
    receiver = LiveLinkFaceReceiver()
    receiver.start()
    try:
        while True:
            blend_shapes = receiver.receive()
            # None (timeout/error) and an empty dict are both skipped.
            if not blend_shapes:
                continue
            print(f"眨眼: {blend_shapes.get('eyeBlink_L', 0):.2f}, {blend_shapes.get('eyeBlink_R', 0):.2f}")
            print(f"微笑: {blend_shapes.get('mouthSmile_L', 0):.2f}")
            print(f"张嘴: {blend_shapes.get('jawOpen', 0):.2f}")
    except KeyboardInterrupt:
        print("\n停止")
    finally:
        receiver.stop()
3.4 MediaPipe面部表情检测
"""
使用MediaPipe进行面部表情检测
"""
import cv2
import mediapipe as mp
import numpy as np
class FaceExpressionDetector:
    """Estimate simple facial-expression values from MediaPipe Face Mesh."""

    def __init__(self):
        self.mp_face_mesh = mp.solutions.face_mesh
        self.face_mesh = self.mp_face_mesh.FaceMesh(
            static_image_mode=False,
            max_num_faces=1,
            refine_landmarks=True,  # enable the refined eye landmarks
            min_detection_confidence=0.5,
            min_tracking_confidence=0.5
        )
        # Face-mesh landmark indices for eyes, mouth and brows. The eye lists
        # are consumed by calculate_ear as
        # [corner, upper, upper, corner, lower, lower].
        self.LEFT_EYE = [362, 385, 387, 263, 373, 380]
        self.RIGHT_EYE = [33, 160, 158, 133, 153, 144]
        self.MOUTH_OUTER = [61, 291, 0, 17, 269, 405]
        self.MOUTH_INNER = [78, 95, 88, 178, 87, 14, 317, 402]
        self.LEFT_BROW = [70, 63, 105, 66, 107]
        self.RIGHT_BROW = [336, 296, 334, 293, 300]

    def calculate_ear(self, landmarks, eye_indices):
        """Compute the eye aspect ratio (eyelid opening / eye width).

        Args:
            landmarks: Indexable collection of points with ``x``/``y``.
            eye_indices: Six indices: [corner, upper, upper, corner, lower,
                lower].

        Returns:
            Non-negative ratio; small values indicate a closed eye. 0 when the
            eye width is 0.
        """
        upper_y = (landmarks[eye_indices[1]].y + landmarks[eye_indices[2]].y) / 2
        lower_y = (landmarks[eye_indices[4]].y + landmarks[eye_indices[5]].y) / 2

        first_corner = landmarks[eye_indices[0]]
        second_corner = landmarks[eye_indices[3]]
        width = np.sqrt(
            (first_corner.x - second_corner.x) ** 2
            + (first_corner.y - second_corner.y) ** 2
        )
        if width <= 0:
            return 0

        # Image y grows downward, so "upper - lower" is negative for an open
        # eye; with that sign every frame fell below the blink threshold.
        # Use the absolute eyelid distance instead.
        return abs(lower_y - upper_y) / width

    def calculate_mar(self, landmarks):
        """Compute the mouth aspect ratio (lip gap / mouth width).

        Returns:
            Non-negative ratio; 0 when the mouth width is 0.
        """
        # Vertical gap between landmarks 13 (upper lip) and 14 (lower lip)
        height = abs(landmarks[13].y - landmarks[14].y)
        # Horizontal distance between the mouth corners (61 and 291)
        width = abs(landmarks[291].x - landmarks[61].x)
        return height / width if width > 0 else 0

    def detect(self, frame):
        """Detect expression values for the first face in a BGR frame.

        Returns:
            Dict with ``blink_left``, ``blink_right``, ``mouth_open``,
            ``smile`` and ``brow_raise``; all 0 when no face is found.
        """
        image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = self.face_mesh.process(image_rgb)

        expression_data = {
            'blink_left': 0,
            'blink_right': 0,
            'mouth_open': 0,
            'smile': 0,
            'brow_raise': 0
        }
        if not results.multi_face_landmarks:
            return expression_data

        landmarks = results.multi_face_landmarks[0].landmark

        # An eye counts as blinking when its aspect ratio is below the threshold.
        BLINK_THRESHOLD = 0.2
        ear_left = self.calculate_ear(landmarks, self.LEFT_EYE)
        ear_right = self.calculate_ear(landmarks, self.RIGHT_EYE)
        expression_data['blink_left'] = 1.0 if ear_left < BLINK_THRESHOLD else 0.0
        expression_data['blink_right'] = 1.0 if ear_right < BLINK_THRESHOLD else 0.0

        # Mouth opening, scaled by 2
        expression_data['mouth_open'] = self.calculate_mar(landmarks) * 2

        # Smile: how far the mouth corners sit above a resting baseline.
        # NOTE(review): the baseline is an absolute normalized image position,
        # so it depends on where the face is in the frame and needs
        # per-user calibration.
        avg_mouth_y = (landmarks[61].y + landmarks[291].y) / 2
        baseline = 0.58
        expression_data['smile'] = max(0, (baseline - avg_mouth_y) * 5)

        # Brow raise, using the same kind of absolute baseline (0.44)
        brow_left_y = (landmarks[70].y + landmarks[63].y) / 2
        brow_right_y = (landmarks[336].y + landmarks[296].y) / 2
        brow_avg = (brow_left_y + brow_right_y) / 2
        expression_data['brow_raise'] = max(0, (0.44 - brow_avg) * 10)

        return expression_data
def face_expression_demo():
    """Show live expression values from the default camera until ESC is pressed."""
    detector = FaceExpressionDetector()
    cap = cv2.VideoCapture(0)
    try:
        while cap.isOpened():
            success, frame = cap.read()
            if not success:
                break
            frame = cv2.flip(frame, 1)

            expression = detector.detect(frame)

            # cv2.putText only renders ASCII with the Hershey fonts; Chinese
            # labels would be drawn as question marks, so the overlay uses
            # English labels.
            texts = [
                f"Blink(L): {expression['blink_left']:.0f}",
                f"Blink(R): {expression['blink_right']:.0f}",
                f"Mouth open: {expression['mouth_open']:.2f}",
                f"Smile: {expression['smile']:.2f}",
                f"Brow raise: {expression['brow_raise']:.2f}"
            ]
            for i, text in enumerate(texts):
                cv2.putText(
                    frame, text, (10, 30 + i * 25),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.6, (0, 255, 0), 2
                )

            cv2.imshow('Face Expression', frame)
            if cv2.waitKey(5) & 0xFF == 27:  # ESC quits
                break
    finally:
        # Release the camera and close the window even if detection raises.
        cap.release()
        cv2.destroyAllWindows()
if __name__ == "__main__":
    face_expression_demo()
4. 骨骼动画基础:理解数字人是如何动起来的
4.1 骨骼系统是什么?
数字人的“骨骼”跟真实人的骨骼类似,是一个层级结构:
- 髋关节是根节点
- 脊柱连接躯干
- 肩膀连接手臂
- 膝盖连接小腿
每个骨骼都有位置和旋转属性,通过改变这些属性,就能让数字人做出各种动作。
头部(head)
↑
颈部(neck)
↑
胸部(chest)
↙ ↘
左肩 右肩
↘ ↙
左臂 右臂
↘ ↙
前臂 前臂
↘ ↙
手腕 手腕
↑
手
4.2 骨骼动画的核心概念
FK(正向运动学):
- 从根节点开始,逐级往下控制
- 适合固定动作的录制
IK(逆运动学):
- 指定末端(比如手)的位置,自动计算中间关节的角度
- 适合实时交互场景
"""
简单的IK(逆运动学)实现
"""
import numpy as np
import math
class SimpleIK:
    """Inverse-kinematics solver for a joint chain (FABRIK algorithm).

    The first joint is the fixed root and the last joint is the end effector.
    """

    def __init__(self, joints, bone_lengths):
        """
        Args:
            joints: Joint positions, root first: [[x, y], ...]
            bone_lengths: Length of each segment: [len1, len2, ...]; entry i
                connects joint i and joint i + 1.
        """
        self.joints = np.array(joints, dtype=float)
        self.bone_lengths = np.array(bone_lengths)
        self.total_length = sum(bone_lengths)

    @staticmethod
    def _unit(vector):
        """Return ``vector`` normalized; a zero vector maps to the first axis."""
        norm = np.linalg.norm(vector)
        if norm == 0:
            # Coincident points have no direction; pick an arbitrary fixed
            # axis instead of dividing by zero and spreading NaN.
            fallback = np.zeros_like(vector, dtype=float)
            fallback[0] = 1.0
            return fallback
        return vector / norm

    def solve(self, target, max_iterations=10, tolerance=0.01):
        """Move the end effector toward ``target`` while keeping the root fixed.

        Args:
            target: Desired end-effector position [x, y].
            max_iterations: Maximum number of FABRIK iterations.
            tolerance: Stop once the end effector is this close to the target.

        Returns:
            The solved joint positions as a nested list, root first.
        """
        target = np.array(target, dtype=float)
        root = self.joints[0].copy()

        if np.linalg.norm(target - root) > self.total_length:
            # Out of reach: the best pose is the fully stretched chain.
            self.stretch_towards(target)
            return self.joints.tolist()

        last = len(self.joints) - 1
        for _ in range(max_iterations):
            # Backward pass: pin the end effector on the target and pull the
            # chain after it.
            self.joints[last] = target
            for i in range(last - 1, -1, -1):
                direction = self._unit(self.joints[i + 1] - self.joints[i])
                self.joints[i] = self.joints[i + 1] - direction * self.bone_lengths[i]

            # Forward pass: put the root back and re-lay the chain from it.
            self.joints[0] = root
            for i in range(last):
                direction = self._unit(self.joints[i + 1] - self.joints[i])
                self.joints[i + 1] = self.joints[i] + direction * self.bone_lengths[i]

            if np.linalg.norm(self.joints[last] - target) < tolerance:
                break

        return self.joints.tolist()

    def stretch_towards(self, target):
        """Lay the chain out in a straight line from the root toward ``target``.

        Used when the target is out of reach. The root stays where it is and
        the joint order is preserved, so each segment keeps its own length.
        """
        target = np.array(target, dtype=float)
        direction = self._unit(target - self.joints[0])
        for i in range(len(self.joints) - 1):
            self.joints[i + 1] = self.joints[i] + direction * self.bone_lengths[i]
# Usage example: a three-segment arm reaching for a target.
if __name__ == "__main__":
    segment_length = 50
    # Four joints stacked along the y axis, 50 units apart.
    joints = [[0, index * segment_length] for index in range(4)]
    # Three segments, 50 units each.
    bone_lengths = [segment_length] * 3

    # Where the end of the arm should go.
    target = [30, 140]

    ik = SimpleIK(joints, bone_lengths)
    result = ik.solve(target)

    print("原始位置:", joints)
    print("求解后:", result)
    print("目标:", target)
5. 将动捕数据应用到数字人
5.1 数据格式转换
动捕系统输出的数据格式各异,需要转换成数字人能用的格式。常见的格式有:
| 格式 | 说明 | 适用软件 |
|---|---|---|
| BVH | Biovision Hierarchy,纯文本 | 通用 |
| FBX | Autodesk格式,二进制 | Unity、UE、Maya |
| SKEL | Unity骨骼格式 | Unity |
| JSON | 自定义格式 | Web应用 |
"""
将MediaPipe数据转换为BVH格式
"""
import json
class BVHExporter:
    """Write skeleton frames to a BVH file using a fixed bone hierarchy.

    Only the root (hip) position is derived from the input; every rotation
    channel is written as 0 (simplified placeholder).
    """

    # BVH bone hierarchy. Whitespace is not significant here.
    # NOTE(review): leaf joints carry no "End Site" block; confirm that the
    # target importer accepts that.
    HIERARCHY = """
HIERARCHY
ROOT hip
{
OFFSET 0.00 85.00 0.00
CHANNELS 6 Xposition Yposition Zposition Zrotation Yrotation Xrotation
JOINT spine
{
OFFSET 0.00 10.00 0.00
CHANNELS 3 Zrotation Yrotation Xrotation
JOINT chest
{
OFFSET 0.00 20.00 0.00
CHANNELS 3 Zrotation Yrotation Xrotation
JOINT neck
{
OFFSET 0.00 15.00 0.00
CHANNELS 3 Zrotation Yrotation Xrotation
JOINT head
{
OFFSET 0.00 10.00 0.00
CHANNELS 3 Zrotation Yrotation Xrotation
}
}
JOINT left_shoulder
{
OFFSET 15.00 10.00 0.00
CHANNELS 3 Zrotation Yrotation Xrotation
JOINT left_elbow
{
OFFSET 30.00 0.00 0.00
CHANNELS 3 Zrotation Yrotation Xrotation
JOINT left_wrist
{
OFFSET 25.00 0.00 0.00
CHANNELS 3 Zrotation Yrotation Xrotation
}
}
}
JOINT right_shoulder
{
OFFSET -15.00 10.00 0.00
CHANNELS 3 Zrotation Yrotation Xrotation
JOINT right_elbow
{
OFFSET -30.00 0.00 0.00
CHANNELS 3 Zrotation Yrotation Xrotation
JOINT right_wrist
{
OFFSET -25.00 0.00 0.00
CHANNELS 3 Zrotation Yrotation Xrotation
}
}
}
}
}
JOINT left_hip
{
OFFSET 10.00 -10.00 0.00
CHANNELS 3 Zrotation Yrotation Xrotation
JOINT left_knee
{
OFFSET 0.00 -40.00 0.00
CHANNELS 3 Zrotation Yrotation Xrotation
JOINT left_ankle
{
OFFSET 0.00 -40.00 0.00
CHANNELS 3 Zrotation Yrotation Xrotation
}
}
}
JOINT right_hip
{
OFFSET -10.00 -10.00 0.00
CHANNELS 3 Zrotation Yrotation Xrotation
JOINT right_knee
{
OFFSET 0.00 -40.00 0.00
CHANNELS 3 Zrotation Yrotation Xrotation
JOINT right_ankle
{
OFFSET 0.00 -40.00 0.00
CHANNELS 3 Zrotation Yrotation Xrotation
}
}
}
}
"""

    # Number of channels on the root joint (3 position + 3 rotation).
    _ROOT_CHANNELS = 6

    def __init__(self):
        self.frame_time = 1 / 30  # seconds per frame (30 fps)

    @classmethod
    def _channel_count(cls):
        """Return the total number of channels declared in HIERARCHY."""
        total = 0
        for line in cls.HIERARCHY.splitlines():
            tokens = line.split()
            if tokens and tokens[0] == "CHANNELS":
                total += int(tokens[1])
        return total

    def export(self, skeleton_data, output_path):
        """Write the frames to a BVH file.

        Args:
            skeleton_data: List of frame dicts, each with a ``landmarks`` dict.
            output_path: Path of the file to write.
        """
        with open(output_path, 'w') as f:
            # The hierarchy literal begins with a newline; strip it so the
            # file starts with the HIERARCHY keyword.
            f.write(self.HIERARCHY.strip())
            f.write("\n")
            f.write("MOTION\n")
            f.write(f"Frames: {len(skeleton_data)}\n")
            f.write(f"Frame Time: {self.frame_time}\n")
            # One line per frame, one value per declared channel
            for frame_data in skeleton_data:
                motion_values = self.skeleton_to_motion(frame_data)
                f.write(" ".join([f"{v:.4f}" for v in motion_values]) + "\n")

    def skeleton_to_motion(self, frame_data):
        """Convert one skeleton frame into a row of BVH channel values.

        The row always has exactly as many values as HIERARCHY declares
        channels; otherwise the motion data would not line up with the
        hierarchy.

        Args:
            frame_data: Frame dict with an optional ``landmarks`` dict.

        Returns:
            List of channel values: root position (hip midpoint scaled by
            100, or zeros when a hip landmark is missing), root rotation and
            all joint rotations as 0.
        """
        landmarks = frame_data.get('landmarks', {})

        root = [0.0] * self._ROOT_CHANNELS
        if 'left_hip' in landmarks and 'right_hip' in landmarks:
            left, right = landmarks['left_hip'], landmarks['right_hip']
            # NOTE(review): the landmark y value is used as-is; confirm its
            # direction matches the up axis expected by the consumer.
            root[0] = (left['x'] + right['x']) / 2 * 100
            root[1] = (left['y'] + right['y']) / 2 * 100
            root[2] = (left.get('z', 0) + right.get('z', 0)) / 2 * 100

        # Simplified: joint rotations are not computed yet, so they stay 0.
        rotations = [0.0] * (self._channel_count() - self._ROOT_CHANNELS)
        return root + rotations
# Usage example
if __name__ == "__main__":
    # Assumes skeleton_data has already been produced, e.g.:
    # skeleton_data = extractor.extract_from_video('input.mp4')
    exporter = BVHExporter()
    # exporter.export(skeleton_data, 'output.bvh')
    print("BVH导出器就绪")
5.2 Unity中应用动捕数据
// Unity中接收外部骨架数据并驱动角色
using UnityEngine;
using System.Collections.Generic;
public class SkeletonDriver : MonoBehaviour
{
    // Bone mapping: assign the character's bone transforms in the Inspector.
    public Transform hip;
    public Transform spine;
    public Transform chest;
    public Transform neck;
    public Transform head;
    public Transform leftShoulder;
    public Transform leftElbow;
    public Transform leftWrist;
    public Transform rightShoulder;
    public Transform rightElbow;
    public Transform rightWrist;

    // Last smoothed X rotation (degrees) per bone key.
    private Dictionary<string, float> smoothValues = new Dictionary<string, float>();

    void Start()
    {
        // Seed the smoothing state.
        smoothValues["hip_x"] = 0;
        smoothValues["hip_y"] = 0;
        smoothValues["spine_rot"] = 0;
        // ...
    }

    // Parses one JSON skeleton frame and applies it to the mapped bones.
    public void ApplySkeletonData(string jsonData)
    {
        // Nothing to parse; skip instead of handing an empty string to the parser.
        if (string.IsNullOrEmpty(jsonData)) return;

        var data = JsonUtility.FromJson<SkeletonData>(jsonData);
        if (data == null) return;

        // Rotations
        ApplyRotation(leftElbow, data.leftElbowAngle, "left_elbow");
        ApplyRotation(rightElbow, data.rightElbowAngle, "right_elbow");
        ApplyRotation(neck, data.neckAngle, "neck");
        ApplyRotation(head, data.headAngle, "head");

        // Root position
        ApplyPosition(hip, data.hipPosition);
    }

    // Smoothly moves the bone's local X rotation toward rotation.x.
    // Only the X component is applied; Y and Z keep their current values.
    void ApplyRotation(Transform bone, Vector3 rotation, string key)
    {
        if (bone == null) return;

        if (!smoothValues.ContainsKey(key))
            smoothValues[key] = 0;

        float target = rotation.x;
        // LerpAngle interpolates along the shortest arc, so a step across the
        // 0/360 boundary does not swing the bone the long way round as a
        // plain Lerp would.
        smoothValues[key] = Mathf.LerpAngle(smoothValues[key], target, 0.3f);

        bone.localEulerAngles = new Vector3(
            smoothValues[key],
            bone.localEulerAngles.y,
            bone.localEulerAngles.z
        );
    }

    // Smoothly moves the bone toward the given position (scaled by 100).
    void ApplyPosition(Transform bone, Vector3 position)
    {
        if (bone == null) return;

        bone.localPosition = Vector3.Lerp(
            bone.localPosition,
            position * 100, // scale
            0.3f
        );
    }
}
[System.Serializable]
public class SkeletonData
{
public Vector3 hipPosition;
public Vector3 leftElbowAngle;
public Vector3 rightElbowAngle;
public Vector3 neckAngle;
public Vector3 headAngle;
}
6. 实时动作捕捉系统搭建
6.1 系统架构
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
│ 摄像头 │────▶│ MediaPipe │────▶│ 骨架数据 │
│ (RGB/D415) │ │ 姿态检测 │ │ (JSON) │
└─────────────┘ └─────────────┘ └──────┬──────┘
│
┌─────────────┐ ┌────────▼────────┐
│ WebSocket │────▶│ 数字人渲染 │
│ 服务器 │ │ (Unity/UE/Web) │
└─────────────┘ └─────────────────┘
6.2 完整代码示例
"""
实时动作捕捉系统
摄像头 → MediaPipe → WebSocket → Unity/WebGL
"""
import cv2
import mediapipe as mp
import asyncio
import websockets
import json
import threading
import numpy as np
class RealTimeMoCap:
    """Real-time motion capture: camera -> MediaPipe -> optional WebSocket."""

    def __init__(self, websocket_url=None):
        """
        Args:
            websocket_url: Server to stream skeleton data to; ``None``
                disables sending.
        """
        self.mp_pose = mp.solutions.pose
        self.pose = self.mp_pose.Pose(
            static_image_mode=False,
            model_complexity=2,
            smooth_landmarks=True
        )
        self.mp_hands = mp.solutions.hands
        self.hands = self.mp_hands.Hands(
            static_image_mode=False,
            max_num_hands=2,
            model_complexity=1
        )
        # Used by process_frame to draw the hand landmarks.
        self.mp_drawing = mp.solutions.drawing_utils

        self.websocket_url = websocket_url
        self.ws = None
        self.running = False
        # Weight of the previous frame in smooth_skeleton (0 = no smoothing).
        self.smoothing = 0.7
        self.prev_landmarks = None

        # Bone name -> MediaPipe pose landmark index. MediaPipe has no spine,
        # chest or neck landmark, so the first four entries are rough
        # stand-ins taken from a single nearby landmark.
        self.skeleton_mapping = {
            'hips': 23,    # left hip (stand-in)
            'spine': 24,   # right hip (stand-in)
            'chest': 12,   # right shoulder (stand-in)
            'neck': 11,    # left shoulder (stand-in)
            'head': 0,     # nose
            'left_shoulder': 11,
            'left_elbow': 13,
            'left_wrist': 15,
            'right_shoulder': 12,
            'right_elbow': 14,
            'right_wrist': 16,
            'left_hip': 23,
            'left_knee': 25,
            'left_ankle': 27,
            'right_hip': 24,
            'right_knee': 26,
            'right_ankle': 28,
        }

    @staticmethod
    def _points(landmarks):
        """Return the indexable landmark sequence.

        Accepts either MediaPipe's landmark-list message (points are in its
        ``landmark`` field; the message itself is not indexable) or a plain
        sequence of landmarks.
        """
        return getattr(landmarks, 'landmark', landmarks)

    async def connect_websocket(self):
        """Connect to the WebSocket server, if a URL was configured."""
        if self.websocket_url:
            self.ws = await websockets.connect(self.websocket_url)
            print(f"已连接到 {self.websocket_url}")

    async def send_data(self, data):
        """Send ``data`` as JSON; a no-op when not connected."""
        if self.ws:
            await self.ws.send(json.dumps(data))

    def process_frame(self, frame):
        """Detect pose and hands in one BGR frame and draw them onto it.

        Returns:
            Tuple ``(frame, skeleton_data)``; ``skeleton_data`` is the
            smoothed skeleton dict, or ``None`` when no pose was detected.
        """
        image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        pose_results = self.pose.process(image_rgb)
        hand_results = self.hands.process(image_rgb)

        skeleton_data = None
        if pose_results.pose_landmarks:
            skeleton_data = self.extract_skeleton(pose_results.pose_landmarks)
            skeleton_data = self.smooth_skeleton(skeleton_data)
            self.draw_skeleton(frame, pose_results.pose_landmarks)

        if hand_results.multi_hand_landmarks:
            for hand_landmarks in hand_results.multi_hand_landmarks:
                self.mp_drawing.draw_landmarks(
                    frame,
                    hand_landmarks,
                    self.mp_hands.HAND_CONNECTIONS
                )
        return frame, skeleton_data

    def extract_skeleton(self, landmarks):
        """Pick the mapped landmarks and return them keyed by bone name.

        Args:
            landmarks: MediaPipe pose landmark list, or a plain sequence of
                landmarks with ``x``/``y``/``z``/``visibility``.

        Returns:
            Dict mapping bone name to ``{'x', 'y', 'z', 'visibility'}``.
        """
        points = self._points(landmarks)
        data = {}
        for name, idx in self.skeleton_mapping.items():
            landmark = points[idx]
            data[name] = {
                'x': landmark.x,
                'y': landmark.y,
                'z': landmark.z,
                'visibility': landmark.visibility
            }
        return data

    def smooth_skeleton(self, current):
        """Blend the current skeleton with the previous smoothed one.

        Each coordinate becomes ``prev * smoothing + curr * (1 - smoothing)``;
        ``visibility`` is taken from the current frame. The first frame is
        returned unchanged.
        """
        if self.prev_landmarks is None:
            self.prev_landmarks = current
            return current

        keep = self.smoothing
        smoothed = {}
        for key, curr in current.items():
            prev = self.prev_landmarks.get(key)
            if prev is None:
                smoothed[key] = curr
                continue
            smoothed[key] = {
                'x': prev['x'] * keep + curr['x'] * (1 - keep),
                'y': prev['y'] * keep + curr['y'] * (1 - keep),
                'z': prev['z'] * keep + curr['z'] * (1 - keep),
                'visibility': curr['visibility']
            }
        self.prev_landmarks = smoothed
        return smoothed

    def draw_skeleton(self, frame, landmarks):
        """Draw pose connections (green) and key points (red) onto ``frame``."""
        points = self._points(landmarks)
        h, w, _ = frame.shape

        # Landmarks are normalized; scale them to pixel coordinates.
        pixels = [(int(p.x * w), int(p.y * h)) for p in points]

        for start_idx, end_idx in mp.solutions.pose.POSE_CONNECTIONS:
            cv2.line(frame, pixels[start_idx], pixels[end_idx], (0, 255, 0), 2)
        for point in pixels:
            cv2.circle(frame, point, 4, (0, 0, 255), -1)

    async def run(self):
        """Capture from the default camera until ESC is pressed or stop() is called."""
        cap = cv2.VideoCapture(0)
        self.running = True
        try:
            if self.websocket_url:
                await self.connect_websocket()

            while self.running and cap.isOpened():
                success, frame = cap.read()
                if not success:
                    break

                frame = cv2.flip(frame, 1)  # mirror horizontally
                frame, skeleton_data = self.process_frame(frame)
                cv2.imshow('Real-Time MoCap', frame)

                if skeleton_data:
                    await self.send_data({
                        'type': 'skeleton',
                        'data': skeleton_data
                    })

                if cv2.waitKey(5) & 0xFF == 27:  # ESC quits
                    self.running = False
                    break
        finally:
            # Release the camera, window and connection even on errors.
            cap.release()
            cv2.destroyAllWindows()
            if self.ws:
                await self.ws.close()

    def stop(self):
        """Ask the main loop to exit after the current frame."""
        self.running = False
async def main():
    """Run the capture loop, streaming skeletons to the local WebSocket server."""
    # Local endpoint (e.g. the Unity side)
    server_url = "ws://localhost:8765"
    await RealTimeMoCap(websocket_url=server_url).run()
if __name__ == "__main__":
    asyncio.run(main())
7. 常见问题与解决方案
Q1: 摄像头检测不准确怎么办?
- 光线问题:确保环境光线均匀,不要背光
- 距离问题:保持适当距离(1.5-3米最佳)
- 遮挡问题:尽量让全身都在画面中
- 参数调整:增大 `min_detection_confidence`
Q2: 动作抖动怎么解决?
- 开启平滑:`smooth_landmarks=True`
- 增加后处理:使用移动平均或卡尔曼滤波
- 降低帧率:从60fps降到30fps
Q3: 延迟太高怎么办?
- 降低模型复杂度:`model_complexity=1` 代替 2
- 减小输入分辨率:resize到640x480
- 使用GPU:如果支持的话
Q4: 怎么让数字人动作更自然?
- 动作混合:不要直接替换,渐变过渡
- 添加待机动画:静止时也要有呼吸等微动
- 随机微动作:偶尔的眨眼、头部微动
相关文档
更新日志
| 日期 | 版本 | 修改内容 |
|---|---|---|
| 2026-04-18 | v1.0 | 初版完成 |
| 2026-04-24 | v1.1 | 深度改写,增加详细实操代码 |
版权声明
本文档为归愚知识库原创内容,采用CC BY-NC-SA 4.0协议授权。