动作捕捉技术:让数字人动起来的核心技能

这篇文章帮你理解动作捕捉

数字人除了要有好看的脸,还要会动才行。动作捕捉就是把你的动作“复制”到数字人身上的技术。读完这篇,你会明白动作捕捉是怎么工作的,以及怎么用各种工具实现从摄像头捕捉动作来驱动数字人。

先搞清楚:什么是动作捕捉?

你有没有看过那种很炫酷的幕后花絮?演员穿着满身是球的紧身衣,在绿幕前做各种动作。然后后期人员把这些动作“绑定”到电脑制作出的角色身上,让角色做出跟演员一样的动作。这就是动作捕捉(Motion Capture,简称MoCap)。

简单来说,动作捕捉就是:记录真人的动作,然后把这些动作“转移”到数字人身上。

动作捕捉有什么用?

  1. 游戏动画:你看到游戏角色的动作,很多都是真人动捕后绑定的
  2. 电影特效:阿凡达、猩球崛起等电影的数字角色
  3. 虚拟主播:VTuber的动作就是通过动捕实时驱动数字人
  4. 体育分析:捕捉运动员的动作来做技术分析
  5. 医疗康复:捕捉患者动作来做康复评估

动作捕捉的类型

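| 类型 | 原理 | 优点 | 缺点 | 成本 |
| --- | --- | --- | --- | --- |
| 光学动捕 | 用多台相机追踪反光球 | 精度极高 | 需要专业设备、暗室 | 10万+ |
| 惯性动捕 | IMU传感器记录身体姿态 | 便携、可户外 | 精度一般、需穿戴 | 1-5万 |
| Markerless | AI识别视频中的人体 | 无需穿戴 | 精度较差 | 免费-低 |
| 纯AI动捕 | 用AI从单目视频估计姿态 | 最简单 | 精度最差 | 免费 |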

作为个人开发者或小型团队,我们主要关注Markerless和纯AI动捕这两类方案,因为其他方案成本太高。


1. MediaPipe:Google开源的AI动作捕捉

1.1 MediaPipe是什么?

MediaPipe是Google开源的跨平台机器学习框架,其中最常用的就是人体姿态估计(Pose Estimation)和手部追踪(Hand Tracking)。

它可以:

  • 从摄像头实时检测人体的关键点位置
  • 检测手部21个关键点
  • 检测面部468个特征点
  • 完全免费,而且精度还不错

1.2 安装MediaPipe

# 基础安装
pip install mediapipe
 
# 如果你想用GPU加速(推荐)
pip install mediapipe[gpu]
 
# 后面的示例还需要用opencv-python读取摄像头和视频
pip install opencv-python

1.3 MediaPipe基础使用:人体姿态检测

"""
MediaPipe人体姿态检测基础示例
"""
import cv2
import mediapipe as mp
import numpy as np
 
# Initialise the MediaPipe pose estimator shared by the functions below
mp_pose = mp.solutions.pose
pose = mp_pose.Pose(
    static_image_mode=False,    # video-stream mode
    model_complexity=2,       # model complexity (0/1/2; higher is more accurate but slower)
    smooth_landmarks=True,     # smooth landmarks to reduce jitter
    enable_segmentation=False, # no person segmentation
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5
)
 
# Drawing helpers
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles
 
def detect_pose(image):
    """
    Run pose estimation on one frame and draw the skeleton onto it.

    Args:
        image: BGR image; the overlay is drawn on it in place.

    Returns:
        Tuple of (the same image, the ``pose_landmarks`` value reported by
        MediaPipe). Nothing is drawn when that value is falsy.
    """
    # MediaPipe expects RGB input while OpenCV frames are BGR.
    detection = pose.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    found = detection.pose_landmarks

    if not found:
        return image, found

    # Key points plus the connecting lines between them.
    mp_drawing.draw_landmarks(
        image,
        found,
        mp_pose.POSE_CONNECTIONS,
        landmark_drawing_spec=mp_drawing_styles.get_default_pose_landmarks_style(),
    )
    return image, found
 
# Real-time webcam detection
def webcam_demo():
    """
    Show live pose detection from the default camera until ESC is pressed.

    Each frame is mirrored, passed through ``detect_pose`` and displayed; the
    normalised nose position is printed whenever a pose is found.
    """
    cap = cv2.VideoCapture(0)  # 0 selects the default camera

    try:
        while cap.isOpened():
            success, image = cap.read()
            if not success:
                print("无法读取摄像头")
                break

            # Mirror horizontally (selfie view)
            image = cv2.flip(image, 1)

            image, landmarks = detect_pose(image)

            if landmarks:
                # pose_landmarks is a landmark-list message; the individual
                # points live in its `.landmark` field, not on the message.
                nose = landmarks.landmark[mp_pose.PoseLandmark.NOSE]
                print(f"鼻子位置: x={nose.x:.3f}, y={nose.y:.3f}")

            cv2.imshow('MediaPipe Pose', image)

            # ESC quits
            if cv2.waitKey(5) & 0xFF == 27:
                break
    finally:
        # Release the camera and windows even if a frame raised.
        cap.release()
        cv2.destroyAllWindows()
 
if __name__ == "__main__":
    webcam_demo()

1.4 提取骨架数据并驱动数字人

"""
从MediaPipe提取骨架数据
"""
import cv2
import mediapipe as mp
import numpy as np
import json
 
class SkeletonExtractor:
    """
    Extract skeleton data from video files or camera frames with MediaPipe Pose.
    """

    # Names of the 33 MediaPipe pose landmarks, in index order
    LANDMARK_NAMES = [
        'nose', 'left_eye_inner', 'left_eye', 'left_eye_outer',
        'right_eye_inner', 'right_eye', 'right_eye_outer',
        'left_ear', 'right_ear', 'mouth_left', 'mouth_right',
        'left_shoulder', 'right_shoulder', 'left_elbow', 'right_elbow',
        'left_wrist', 'right_wrist', 'left_pinky', 'right_pinky',
        'left_index', 'right_index', 'left_thumb', 'right_thumb',
        'left_hip', 'right_hip', 'left_knee', 'right_knee',
        'left_ankle', 'right_ankle', 'left_heel', 'right_heel',
        'left_foot_index', 'right_foot_index'
    ]

    # Frame rate assumed when the video source reports no usable FPS
    DEFAULT_FPS = 30.0

    # (angle name, (first point, vertex, last point)), in output order
    JOINT_ANGLES = (
        ('left_elbow', ('left_shoulder', 'left_elbow', 'left_wrist')),
        ('right_elbow', ('right_shoulder', 'right_elbow', 'right_wrist')),
        ('left_knee', ('left_hip', 'left_knee', 'left_ankle')),
        ('right_knee', ('right_hip', 'right_knee', 'right_ankle')),
    )

    def __init__(self):
        self.mp_pose = mp.solutions.pose
        self.pose = self.mp_pose.Pose(
            static_image_mode=False,
            model_complexity=2,
            smooth_landmarks=True
        )

        # Bone connections (pairs of landmark names) used when binding
        # to a digital human
        self.bone_connections = [
            # Torso
            ('left_shoulder', 'right_shoulder'),
            ('left_shoulder', 'left_hip'),
            ('right_shoulder', 'right_hip'),
            ('left_hip', 'right_hip'),
            # Arms
            ('left_shoulder', 'left_elbow'),
            ('left_elbow', 'left_wrist'),
            ('right_shoulder', 'right_elbow'),
            ('right_elbow', 'right_wrist'),
            # Legs
            ('left_hip', 'left_knee'),
            ('left_knee', 'left_ankle'),
            ('right_hip', 'right_knee'),
            ('right_knee', 'right_ankle'),
        ]

    def extract_from_frame(self, frame):
        """
        Extract skeleton data from a single frame.

        Args:
            frame: BGR image

        Returns:
            Dict mapping landmark name to a dict with 'x', 'y', 'z' and
            'visibility', or None when no pose was detected.
        """
        image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = self.pose.process(image_rgb)

        if not results.pose_landmarks:
            return None

        return {
            name: {
                'x': landmark.x,
                'y': landmark.y,
                'z': landmark.z,
                'visibility': landmark.visibility,
            }
            for name, landmark in zip(
                self.LANDMARK_NAMES, results.pose_landmarks.landmark
            )
        }

    def extract_from_video(self, video_path, output_path=None):
        """
        Extract skeleton data from every frame of a video file.

        Frames without a detected pose are skipped, so 'frame' indices in the
        result may have gaps.

        Args:
            video_path: path of the video file
            output_path: optional path; when given, the result is also
                written there as JSON

        Returns:
            List of dicts with 'frame', 'timestamp' (seconds) and 'landmarks'.
        """
        cap = cv2.VideoCapture(video_path)
        skeleton_data = []

        try:
            fps = cap.get(cv2.CAP_PROP_FPS)
            # Some sources report 0 (or NaN); dividing by that would crash
            # or produce meaningless timestamps.
            if not fps > 0:
                fps = self.DEFAULT_FPS

            frame_count = 0
            while cap.isOpened():
                success, frame = cap.read()
                if not success:
                    break

                landmarks = self.extract_from_frame(frame)
                if landmarks:
                    skeleton_data.append({
                        'frame': frame_count,
                        'timestamp': frame_count / fps,
                        'landmarks': landmarks
                    })

                frame_count += 1
        finally:
            # Always free the capture, even if a frame failed to process.
            cap.release()

        if output_path:
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(skeleton_data, f, indent=2)

        return skeleton_data

    @staticmethod
    def _joint_angle(p1, p2, p3):
        """Return the 2D angle at p2 in degrees, or None if it is undefined."""
        v1 = np.array([p1['x'] - p2['x'], p1['y'] - p2['y']])
        v2 = np.array([p3['x'] - p2['x'], p3['y'] - p2['y']])

        denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
        if denominator == 0:
            # A point coincides with the vertex: no angle can be formed.
            return None

        # Clip guards arccos against rounding slightly outside [-1, 1].
        cos_angle = np.clip(np.dot(v1, v2) / denominator, -1, 1)
        return float(np.degrees(np.arccos(cos_angle)))

    def landmarks_to_bone_angles(self, landmarks):
        """
        Compute elbow and knee angles from landmark positions.

        Only the 'x' and 'y' components are used, so the angles are measured
        in the image plane.

        Args:
            landmarks: dict mapping landmark name to a dict with 'x' and 'y'

        Returns:
            Dict mapping 'left_elbow', 'right_elbow', 'left_knee' and
            'right_knee' to an angle in degrees. A joint is omitted when one
            of its three landmarks is missing or when the angle is undefined
            because two of the points coincide.
        """
        angles = {}
        for angle_name, point_names in self.JOINT_ANGLES:
            if not all(name in landmarks for name in point_names):
                continue
            angle = self._joint_angle(*(landmarks[name] for name in point_names))
            if angle is not None:
                angles[angle_name] = angle
        return angles
 
# Usage example
if __name__ == "__main__":
    extractor = SkeletonExtractor()

    # Offline alternative: extract from a video file instead of the camera
    # skeleton_data = extractor.extract_from_video(
    #     'input_video.mp4',
    #     'skeleton_data.json'
    # )

    # Real-time processing from the default camera
    cap = cv2.VideoCapture(0)

    try:
        while cap.isOpened():
            success, frame = cap.read()
            if not success:
                break

            frame = cv2.flip(frame, 1)

            landmarks = extractor.extract_from_frame(frame)

            if landmarks:
                angles = extractor.landmarks_to_bone_angles(landmarks)

                # ASCII only: OpenCV's Hershey fonts cannot draw the degree
                # sign and would show question marks instead.
                angle_text = [
                    f"{name}: {angle:.1f} deg" for name, angle in angles.items()
                ]

                if angle_text:
                    cv2.putText(
                        frame,
                        ' | '.join(angle_text[:4]),  # show at most four angles
                        (10, 30),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        0.7,
                        (0, 255, 0),
                        2
                    )

            cv2.imshow('Skeleton', frame)

            # ESC quits
            if cv2.waitKey(5) & 0xFF == 27:
                break
    finally:
        # Release the camera and windows even if a frame raised.
        cap.release()
        cv2.destroyAllWindows()

1.5 MediaPipe手部追踪

"""
MediaPipe手部追踪
"""
import cv2
import mediapipe as mp
import numpy as np
 
class HandTracker:
    """
    Hand tracker built on MediaPipe Hands.

    Reports 21 key points per detected hand and offers a simple rule-based
    gesture classifier on top of them.
    """

    # Landmark indices of (tip, joint below the tip) for thumb..pinky
    FINGER_JOINTS = ((4, 3), (8, 6), (12, 10), (16, 14), (20, 18))

    # Extended-finger pattern (thumb, index, middle, ring, pinky) -> gesture
    GESTURES = {
        (False, False, False, False, False): "Fist",
        (True, True, True, True, True): "Open",
        (False, True, True, False, False): "Peace",
        (True, False, False, False, False): "Like",
        (False, True, False, False, False): "Point",
        (False, True, True, True, False): "Three",
    }

    def __init__(self, max_hands=2):
        self.mp_hands = mp.solutions.hands
        self.hands = self.mp_hands.Hands(
            static_image_mode=False,
            max_num_hands=max_hands,
            min_detection_confidence=0.5,
            min_tracking_confidence=0.5
        )
        self.mp_draw = mp.solutions.drawing_utils

    def get_hand_landmarks(self, frame):
        """
        Detect hands in a frame and draw them onto it.

        Args:
            frame: BGR image; the hand skeletons are drawn on it in place.

        Returns:
            Tuple of (frame, list of dicts with 'handedness' and 'landmarks',
            where 'landmarks' is a list of {'x', 'y', 'z'} dicts).
        """
        image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = self.hands.process(image_rgb)

        hands_data = []
        if not results.multi_hand_landmarks:
            return frame, hands_data

        # The two result lists are walked in lockstep. Looking the hand up
        # with list.index() compares messages by value, which is slower and
        # picks the wrong entry if two hands ever compare equal.
        for hand_landmarks, hand_class in zip(
            results.multi_hand_landmarks, results.multi_handedness
        ):
            self.mp_draw.draw_landmarks(
                frame,
                hand_landmarks,
                self.mp_hands.HAND_CONNECTIONS
            )

            hands_data.append({
                'handedness': hand_class.classification[0].label,
                'landmarks': [
                    {'x': lm.x, 'y': lm.y, 'z': lm.z}
                    for lm in hand_landmarks.landmark
                ],
            })

        return frame, hands_data

    def recognize_gesture(self, landmarks):
        """
        Classify a hand pose from its key points.

        A finger counts as extended when its tip is above (smaller 'y' than)
        the joint below it, so the rule only holds for an upright hand.

        Args:
            landmarks: list of 21 dicts with at least a 'y' entry

        Returns:
            One of "Fist", "Open", "Peace", "Like", "Point", "Three",
            "Other", or the string "None" when fewer than 21 points are given.
        """
        if not landmarks or len(landmarks) < 21:
            return "None"

        pattern = tuple(
            landmarks[tip]['y'] < landmarks[joint]['y']  # smaller y is higher up
            for tip, joint in self.FINGER_JOINTS
        )
        return self.GESTURES.get(pattern, "Other")
 
def hand_tracker_demo():
    """Show live hand tracking with the recognised gesture next to each wrist."""
    tracker = HandTracker(max_hands=2)
    camera = cv2.VideoCapture(0)

    while camera.isOpened():
        ok, frame = camera.read()
        if not ok:
            break

        # Mirror the frame, then detect; the tracker draws onto the frame.
        frame, hands = tracker.get_hand_landmarks(cv2.flip(frame, 1))

        for hand in hands:
            points = hand['landmarks']
            gesture = tracker.recognize_gesture(points)

            # Landmark 0 is the wrist; scale it to pixel coordinates.
            wrist = points[0]
            anchor = (
                int(wrist['x'] * frame.shape[1]),
                int(wrist['y'] * frame.shape[0]),
            )
            cv2.putText(
                frame,
                f"{hand['handedness']}: {gesture}",
                anchor,
                cv2.FONT_HERSHEY_SIMPLEX,
                1,
                (0, 255, 0),
                2,
            )

        cv2.imshow('Hand Tracking', frame)

        # ESC quits
        if (cv2.waitKey(5) & 0xFF) == 27:
            break

    camera.release()
    cv2.destroyAllWindows()
 
if __name__ == "__main__":
    hand_tracker_demo()

2. OpenPose:更强大的开源方案

2.1 OpenPose简介

OpenPose是卡内基梅隆大学开源的姿态检测框架,比MediaPipe更专业,但安装也更复杂。

主要功能:

  • 身体关键点检测(25个点)
  • 手部关键点检测(21个点)
  • 面部关键点检测(70个点)
  • 支持多人检测

2.2 OpenPose安装(Ubuntu/Mac)

# 安装依赖
sudo apt-get install libopencv-dev python3-opencv
 
# 克隆仓库
git clone https://github.com/CMU-Perceptual-Computing-Lab/openpose.git
cd openpose
 
# 使用Docker(推荐,更简单)
docker pull cwaffles/openpose  # 社区维护的镜像(OpenPose官方没有发布镜像),可按需换成其他镜像
 
# 或者用Python版本(功能较少但安装简单)
pip install openpifpaf

2.3 OpenPose快速使用

# 摄像头实时检测
./build/examples/openpose/openpose.bin --camera 0
 
# 从视频文件检测
./build/examples/openpose/openpose.bin --video input_video.mp4
 
# 保存结果
./build/examples/openpose/openpose.bin \
    --video input_video.mp4 \
    --write_json output_json/ \
    --write_video output_video.avi

3. 表情捕捉:让数字人有表情

3.1 表情捕捉是什么?

光有身体动作还不够,数字人还要有表情才能“活”起来。表情捕捉就是检测人的面部表情,然后驱动数字人的表情系统。

3.2 ARKit表情捕捉(iOS设备)

如果你是iPhone或iPad用户,可以使用ARKit自带的Face Tracking功能:

import ARKit
import RealityKit
 
/// Runs an ARKit face-tracking session and reads blend-shape coefficients
/// from the face anchor on every update.
class FaceTracker: NSObject, ARSessionDelegate {
    /// Session used for tracking. May be assigned by the owner before
    /// `setupFaceTracking()`; otherwise one is created on demand.
    var session: ARSession!
    
    /// Starts face tracking, or prints a message and returns when the device
    /// does not support it.
    func setupFaceTracking() {
        guard ARFaceTrackingConfiguration.isSupported else {
            print("设备不支持面部追踪")
            return
        }
        
        // `session` is implicitly unwrapped: calling run() on nil would crash.
        if session == nil {
            session = ARSession()
        }
        // Without a delegate, session(_:didUpdate:) below is never called.
        // An externally assigned delegate is left untouched.
        if session.delegate == nil {
            session.delegate = self
        }
        
        let configuration = ARFaceTrackingConfiguration()
        configuration.isLightEstimationEnabled = true
        session.run(configuration)
    }
    
    /// Reads the blend shapes of the first face anchor among the updated
    /// anchors and prints a few of them.
    func session(_ session: ARSession, didUpdate anchors: [ARAnchor]) {
        // Do not assume the face anchor is the first element of the list.
        guard let faceAnchor = anchors.compactMap({ $0 as? ARFaceAnchor }).first else { return }
        
        // Blend-shape coefficients keyed by location
        let blendShapes = faceAnchor.blendShapes
        
        // Pick out a few key expressions; a missing entry reads as 0
        let eyeBlinkLeft = blendShapes[.eyeBlinkLeft]?.doubleValue ?? 0
        let eyeBlinkRight = blendShapes[.eyeBlinkRight]?.doubleValue ?? 0
        let mouthSmileLeft = blendShapes[.mouthSmileLeft]?.doubleValue ?? 0
        let mouthSmileRight = blendShapes[.mouthSmileRight]?.doubleValue ?? 0
        let jawOpen = blendShapes[.jawOpen]?.doubleValue ?? 0
        let browDownLeft = blendShapes[.browDownLeft]?.doubleValue ?? 0
        let browDownRight = blendShapes[.browDownRight]?.doubleValue ?? 0
        
        // Print the values
        print("左眼眨眼: \(eyeBlinkLeft)")
        print("右眼眨眼: \(eyeBlinkRight)")
        print("嘴巴张开: \(jawOpen)")
        print("微笑: \((mouthSmileLeft + mouthSmileRight) / 2)")
        
        // The brow values are read but not used yet; they are kept for the
        // hand-off to the digital-human system:
        // sendToDigitalHuman(...)
        _ = (browDownLeft, browDownRight)
    }
}

3.3 Python端接收ARKit数据

"""
使用Live Link Face接收iPhone的面部追踪数据
"""
import socket
import json
import numpy as np
 
class LiveLinkFaceReceiver:
    """
    Receive face-tracking frames over UDP.

    Each datagram is parsed as a UTF-8 JSON object whose 'blendShapes' entry
    is a list of values; the values are named by position using
    ``blend_shape_names``.
    """

    # Largest UDP payload; a smaller buffer would silently truncate a
    # datagram and make its JSON unparsable.
    MAX_DATAGRAM_SIZE = 65535

    def __init__(self, port=11111):
        self.port = port
        self.socket = None
        self.running = False

        # Name for each position in the incoming 'blendShapes' list.
        # NOTE(review): this order has to match what the sender emits and
        # has not been verified against it -- confirm before relying on names.
        self.blend_shape_names = [
            'browDown_L', 'browDown_R', 'browInnerUp',
            'browOuterUp_L', 'browOuterUp_R',
            'eyeLookUp_L', 'eyeLookUp_R',
            'eyeLookDown_L', 'eyeLookDown_R',
            'eyeLookIn_L', 'eyeLookIn_R',
            'eyeLookOut_L', 'eyeLookOut_R',
            'eyeBlink_L', 'eyeBlink_R',
            'eyeSquint_L', 'eyeSquint_R',
            'eyeWide_L', 'eyeWide_R',
            'cheekPuff', 'cheekSquint_L', 'cheekSquint_R',
            'noseSneer_L', 'noseSneer_R',
            'jawOpen', 'jawForward', 'jawLeft', 'jawRight',
            'jawClench',
            'mouthFunnel', 'mouthPucker',
            'mouthLeft', 'mouthRight',
            'mouthSmile_L', 'mouthSmile_R',
            'mouthFrown_L', 'mouthFrown_R',
            'mouthDimple_L', 'mouthDimple_R',
            'mouthUpperUp_L', 'mouthUpperUp_R',
            'mouthLowerDown_L', 'mouthLowerDown_R',
            'mouthPress_L', 'mouthPress_R',
            'mouthStretch_L', 'mouthStretch_R',
            'tongueOut'
        ]

    def start(self):
        """
        Bind the UDP socket on all interfaces and start receiving.

        Raises:
            OSError: if the port cannot be bound.
        """
        # Close a socket left over from an earlier start() instead of
        # leaking it.
        if self.socket is not None:
            self.stop()

        sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        try:
            sock.bind(('', self.port))
            sock.settimeout(1.0)  # receive() gives up after one second
        except OSError:
            sock.close()
            raise

        self.socket = sock
        self.running = True
        print(f"监听端口 {self.port}...")

    def receive(self):
        """
        Receive and parse one datagram.

        Returns:
            Dict mapping blend-shape name to value (positions beyond the
            known names are keyed 'bs_<index>'); an empty dict when the
            payload carries no 'blendShapes' list; None when the receiver is
            not running, the wait timed out, or the datagram could not be
            received or decoded.
        """
        if not self.running or self.socket is None:
            return None

        try:
            data, _ = self.socket.recvfrom(self.MAX_DATAGRAM_SIZE)
        except socket.timeout:
            return None
        except OSError as e:
            print(f"接收错误: {e}")
            return None

        try:
            message = json.loads(data.decode('utf-8'))
        except (UnicodeDecodeError, json.JSONDecodeError) as e:
            print(f"接收错误: {e}")
            return None

        values = message.get('blendShapes') if isinstance(message, dict) else None
        if not isinstance(values, (list, tuple)):
            return {}

        known = len(self.blend_shape_names)
        return {
            (self.blend_shape_names[i] if i < known else f"bs_{i}"): value
            for i, value in enumerate(values)
        }

    def stop(self):
        """Stop receiving and close the socket; safe to call repeatedly."""
        self.running = False
        if self.socket:
            self.socket.close()
            self.socket = None
 
# Usage example
if __name__ == "__main__":
    receiver = LiveLinkFaceReceiver()
    receiver.start()

    try:
        while True:
            blend_shapes = receiver.receive()
            if not blend_shapes:
                # Nothing usable this round (None or an empty frame).
                continue

            print(f"眨眼: {blend_shapes.get('eyeBlink_L', 0):.2f}, {blend_shapes.get('eyeBlink_R', 0):.2f}")
            print(f"微笑: {blend_shapes.get('mouthSmile_L', 0):.2f}")
            print(f"张嘴: {blend_shapes.get('jawOpen', 0):.2f}")
    except KeyboardInterrupt:
        # Ctrl+C ends the loop.
        print("\n停止")
    finally:
        receiver.stop()

3.4 MediaPipe面部表情检测

"""
使用MediaPipe进行面部表情检测
"""
import cv2
import mediapipe as mp
import numpy as np
 
class FaceExpressionDetector:
    """
    Estimate simple expression values (blink, mouth open, smile, brow raise)
    from MediaPipe Face Mesh landmarks.
    """

    def __init__(self, blink_threshold=0.2, smile_baseline=0.58, brow_baseline=0.44):
        """
        Args:
            blink_threshold: eye aspect ratio below which an eye counts as closed
            smile_baseline: normalised y of the mouth corners at rest
            brow_baseline: normalised y of the brows at rest

        The two baselines are absolute image positions, so they depend on
        where the face sits in the frame and need calibrating per setup.
        """
        self.mp_face_mesh = mp.solutions.face_mesh
        self.face_mesh = self.mp_face_mesh.FaceMesh(
            static_image_mode=False,
            max_num_faces=1,
            refine_landmarks=True,  # enable the refined eye landmarks
            min_detection_confidence=0.5,
            min_tracking_confidence=0.5
        )

        self.blink_threshold = blink_threshold
        self.smile_baseline = smile_baseline
        self.brow_baseline = brow_baseline

        # Landmark indices for the eyes, mouth and brows.
        # Eye lists are ordered: corner, two upper-lid points, other corner,
        # two lower-lid points (this is how calculate_ear reads them).
        self.LEFT_EYE = [362, 385, 387, 263, 373, 380]
        self.RIGHT_EYE = [33, 160, 158, 133, 153, 144]
        self.MOUTH_OUTER = [61, 291, 0, 17, 269, 405]
        self.MOUTH_INNER = [78, 95, 88, 178, 87, 14, 317, 402]
        self.LEFT_BROW = [70, 63, 105, 66, 107]
        self.RIGHT_BROW = [336, 296, 334, 293, 300]

    @staticmethod
    def _distance(p1, p2):
        """Return the 2D distance between two landmarks."""
        return np.sqrt((p1.x - p2.x)**2 + (p1.y - p2.y)**2)

    def calculate_ear(self, landmarks, eye_indices):
        """
        Compute the eye aspect ratio (EAR) used for blink detection.

        Args:
            landmarks: indexable collection of points with .x and .y
            eye_indices: six indices ordered as corner, upper, upper,
                corner, lower, lower

        Returns:
            Lid opening divided by eye width; 0 when the width is 0.
        """
        upper = (landmarks[eye_indices[1]].y + landmarks[eye_indices[2]].y) / 2
        lower = (landmarks[eye_indices[4]].y + landmarks[eye_indices[5]].y) / 2

        # Image y grows downwards, so upper - lower is negative for an open
        # eye; the opening has to be taken as a magnitude or every frame
        # falls below the blink threshold.
        opening = abs(lower - upper)

        width = self._distance(landmarks[eye_indices[0]], landmarks[eye_indices[3]])

        return opening / width if width > 0 else 0

    def calculate_mar(self, landmarks):
        """
        Compute the mouth aspect ratio (MAR) used for mouth-open detection.

        Args:
            landmarks: indexable collection of points with .x and .y

        Returns:
            Lip gap (landmarks 13 and 14) divided by mouth width
            (landmarks 61 and 291); 0 when the width is 0.
        """
        height = abs(landmarks[13].y - landmarks[14].y)
        width = abs(landmarks[291].x - landmarks[61].x)

        return height / width if width > 0 else 0

    def detect(self, frame):
        """
        Detect facial expression values in a frame.

        Args:
            frame: BGR image

        Returns:
            Dict with 'blink_left', 'blink_right', 'mouth_open', 'smile' and
            'brow_raise'; all values are 0 when no face is found.
        """
        image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = self.face_mesh.process(image_rgb)

        expression_data = {
            'blink_left': 0,
            'blink_right': 0,
            'mouth_open': 0,
            'smile': 0,
            'brow_raise': 0
        }

        if not results.multi_face_landmarks:
            return expression_data

        landmarks = results.multi_face_landmarks[0].landmark

        # An eye counts as closed when its EAR drops below the threshold
        ear_left = self.calculate_ear(landmarks, self.LEFT_EYE)
        ear_right = self.calculate_ear(landmarks, self.RIGHT_EYE)
        expression_data['blink_left'] = 1.0 if ear_left < self.blink_threshold else 0.0
        expression_data['blink_right'] = 1.0 if ear_right < self.blink_threshold else 0.0

        # Mouth opening, scaled by 2
        expression_data['mouth_open'] = self.calculate_mar(landmarks) * 2

        # Smile: how far the mouth corners sit above the resting baseline
        avg_mouth_y = (landmarks[61].y + landmarks[291].y) / 2
        expression_data['smile'] = max(0, (self.smile_baseline - avg_mouth_y) * 5)

        # Brow raise: how far the brows sit above the resting baseline
        brow_left_y = (landmarks[70].y + landmarks[63].y) / 2
        brow_right_y = (landmarks[336].y + landmarks[296].y) / 2
        brow_avg = (brow_left_y + brow_right_y) / 2
        expression_data['brow_raise'] = max(0, (self.brow_baseline - brow_avg) * 10)

        return expression_data
 
def face_expression_demo():
    """Show live expression values from the default camera until ESC is pressed."""
    detector = FaceExpressionDetector()

    cap = cv2.VideoCapture(0)

    try:
        while cap.isOpened():
            success, frame = cap.read()
            if not success:
                break

            frame = cv2.flip(frame, 1)

            expression = detector.detect(frame)

            # Labels are ASCII on purpose: OpenCV's Hershey fonts cannot draw
            # Chinese characters and would show question marks instead.
            texts = [
                f"Blink(L): {expression['blink_left']:.0f}",
                f"Blink(R): {expression['blink_right']:.0f}",
                f"Mouth open: {expression['mouth_open']:.2f}",
                f"Smile: {expression['smile']:.2f}",
                f"Brow raise: {expression['brow_raise']:.2f}"
            ]

            # One line of text per value, 25 px apart
            for i, text in enumerate(texts):
                cv2.putText(
                    frame, text, (10, 30 + i * 25),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.6, (0, 255, 0), 2
                )

            cv2.imshow('Face Expression', frame)

            # ESC quits
            if cv2.waitKey(5) & 0xFF == 27:
                break
    finally:
        # Release the camera and windows even if a frame raised.
        cap.release()
        cv2.destroyAllWindows()
 
if __name__ == "__main__":
    face_expression_demo()

4. 骨骼动画基础:理解数字人是如何动起来的

4.1 骨骼系统是什么?

数字人的“骨骼”跟真实人的骨骼类似,是一个层级结构:

  • 髋关节是根节点
  • 脊柱连接躯干
  • 肩膀连接手臂
  • 膝盖连接小腿

每个骨骼都有位置和旋转属性,通过改变这些属性,就能让数字人做出各种动作。

    头部(head)
       ↑
    颈部(neck)
       ↑
    胸部(chest)
     ↙ ↘
  左肩   右肩
    ↘   ↙
   左臂   右臂
     ↘ ↙
   前臂   前臂
     ↘ ↙
    手腕   手腕
       ↑
      手

4.2 骨骼动画的核心概念

FK(正向运动学)

  • 从根节点开始,逐级往下控制
  • 适合固定动作的录制

IK(逆运动学)

  • 指定末端(比如手)的位置,自动计算中间关节的角度
  • 适合实时交互场景
"""
简单的IK(逆运动学)实现
"""
import numpy as np
import math
 
class SimpleIK:
    """
    Inverse-kinematics solver for a single joint chain (FABRIK algorithm).

    The first joint is the fixed root and the last joint is the end effector.
    """

    def __init__(self, joints, bone_lengths):
        """
        Args:
            joints: list of joint positions [[x, y], ...], root first
            bone_lengths: length of each segment [len1, len2, ...]; one
                fewer entry than there are joints

        Raises:
            ValueError: if the number of bone lengths does not match the
                number of joints.
        """
        self.joints = np.array(joints, dtype=float)
        self.bone_lengths = np.array(bone_lengths, dtype=float)
        if len(self.bone_lengths) != len(self.joints) - 1:
            raise ValueError(
                "bone_lengths must have exactly len(joints) - 1 entries"
            )
        self.total_length = sum(bone_lengths)

    def _direction_to(self, target):
        """Return the unit vector from the root to target (first axis if they coincide)."""
        offset = target - self.joints[0]
        norm = np.linalg.norm(offset)
        if norm > 0:
            return offset / norm
        axis = np.zeros_like(self.joints[0])
        axis[0] = 1.0
        return axis

    @staticmethod
    def _unit(vector, fallback):
        """Return vector normalised, or fallback when its length is zero."""
        norm = np.linalg.norm(vector)
        return vector / norm if norm > 0 else fallback

    def solve(self, target, max_iterations=10, tolerance=0.01):
        """
        Move the chain so that its end reaches target as closely as possible.

        The solver updates ``self.joints`` in place; the root stays where it
        was.

        Args:
            target: target position [x, y]
            max_iterations: maximum number of FABRIK iterations
            tolerance: stop once the end is closer than this to the target

        Returns:
            Joint positions after solving, as a list of lists.
        """
        target = np.array(target, dtype=float)

        # The root is restored after every backward pass.
        root = self.joints[0].copy()

        if np.linalg.norm(target - root) > self.total_length:
            # Out of reach: stretch the chain straight towards the target.
            self.stretch_towards(target)
            return self.joints.tolist()

        # Used when two neighbouring joints coincide and give no direction.
        fallback = self._direction_to(target)
        last = len(self.joints) - 1

        for _ in range(max_iterations):
            # Backward pass: pin the end on the target, walk to the root.
            self.joints[last] = target
            for i in range(last - 1, -1, -1):
                direction = self._unit(self.joints[i + 1] - self.joints[i], fallback)
                self.joints[i] = self.joints[i + 1] - direction * self.bone_lengths[i]

            # Forward pass: pin the root back in place, walk to the end.
            self.joints[0] = root
            for i in range(last):
                direction = self._unit(self.joints[i + 1] - self.joints[i], fallback)
                self.joints[i + 1] = self.joints[i] + direction * self.bone_lengths[i]

            if np.linalg.norm(self.joints[last] - target) < tolerance:
                break

        return self.joints.tolist()

    def stretch_towards(self, target):
        """
        Lay the chain out in a straight line from the root towards target.

        Used when the target is out of reach: the root keeps its position
        and every joint is placed along the root-to-target direction at its
        bone length from the previous one.
        """
        direction = self._direction_to(np.asarray(target, dtype=float))
        for i in range(len(self.joints) - 1):
            self.joints[i + 1] = self.joints[i] + direction * self.bone_lengths[i]
 
# Usage example
if __name__ == "__main__":
    # A three-segment arm: four joints stacked along the y axis,
    # each segment 50 units long
    joints = [[0, 50 * k] for k in range(4)]
    bone_lengths = [50] * 3

    # Where the end of the arm should go
    target = [30, 140]

    result = SimpleIK(joints, bone_lengths).solve(target)

    for label, value in (("原始位置:", joints), ("求解后:", result), ("目标:", target)):
        print(label, value)

5. 将动捕数据应用到数字人

5.1 数据格式转换

动捕系统输出的数据格式各异,需要转换成数字人能用的格式。常见的格式有:

| 格式 | 说明 | 适用软件 |
| --- | --- | --- |
| BVH | Biovision Hierarchy,纯文本 | 通用 |
| FBX | Autodesk格式,二进制 | Unity、UE、Maya |
| SKEL | Unity骨骼格式 | Unity |
| JSON | 自定义格式 | Web应用 |
"""
将MediaPipe数据转换为BVH格式
"""
import json
 
class BVHExporter:
    """
    Export skeleton frames as a BVH file.

    Only the root position is filled from the landmarks; every rotation
    channel is written as 0 (simplified placeholder).
    """

    # BVH skeleton hierarchy.
    # NOTE(review): leaf joints have no "End Site" block; some BVH importers
    # expect one -- confirm with the target application.
    HIERARCHY = """
HIERARCHY
ROOT hip
{
    OFFSET 0.00 85.00 0.00
    CHANNELS 6 Xposition Yposition Zposition Zrotation Yrotation Xrotation
    JOINT spine
    {
        OFFSET 0.00 10.00 0.00
        CHANNELS 3 Zrotation Yrotation Xrotation
        JOINT chest
        {
            OFFSET 0.00 20.00 0.00
            CHANNELS 3 Zrotation Yrotation Xrotation
            JOINT neck
            {
                OFFSET 0.00 15.00 0.00
                CHANNELS 3 Zrotation Yrotation Xrotation
                JOINT head
                {
                    OFFSET 0.00 10.00 0.00
                    CHANNELS 3 Zrotation Yrotation Xrotation
                }
            }
            JOINT left_shoulder
            {
                OFFSET 15.00 10.00 0.00
                CHANNELS 3 Zrotation Yrotation Xrotation
                JOINT left_elbow
                {
                    OFFSET 30.00 0.00 0.00
                    CHANNELS 3 Zrotation Yrotation Xrotation
                    JOINT left_wrist
                    {
                        OFFSET 25.00 0.00 0.00
                        CHANNELS 3 Zrotation Yrotation Xrotation
                    }
                }
            }
            JOINT right_shoulder
            {
                OFFSET -15.00 10.00 0.00
                CHANNELS 3 Zrotation Yrotation Xrotation
                JOINT right_elbow
                {
                    OFFSET -30.00 0.00 0.00
                    CHANNELS 3 Zrotation Yrotation Xrotation
                    JOINT right_wrist
                    {
                        OFFSET -25.00 0.00 0.00
                        CHANNELS 3 Zrotation Yrotation Xrotation
                    }
                }
            }
        }
    }
    JOINT left_hip
    {
        OFFSET 10.00 -10.00 0.00
        CHANNELS 3 Zrotation Yrotation Xrotation
        JOINT left_knee
        {
            OFFSET 0.00 -40.00 0.00
            CHANNELS 3 Zrotation Yrotation Xrotation
            JOINT left_ankle
            {
                OFFSET 0.00 -40.00 0.00
                CHANNELS 3 Zrotation Yrotation Xrotation
            }
        }
    }
    JOINT right_hip
    {
        OFFSET -10.00 -10.00 0.00
        CHANNELS 3 Zrotation Yrotation Xrotation
        JOINT right_knee
        {
            OFFSET 0.00 -40.00 0.00
            CHANNELS 3 Zrotation Yrotation Xrotation
            JOINT right_ankle
            {
                OFFSET 0.00 -40.00 0.00
                CHANNELS 3 Zrotation Yrotation Xrotation
            }
        }
    }
}
"""

    # Channels of the root joint (3 position + 3 rotation)
    ROOT_CHANNELS = 6

    def __init__(self, fps=30):
        """
        Args:
            fps: frame rate written to the file as "Frame Time" (1 / fps)
        """
        self.frame_time = 1 / fps
        # Every motion line must carry exactly as many values as the
        # hierarchy declares channels, so the count is derived from it.
        self.channel_count = self._count_channels(self.HIERARCHY)

    @staticmethod
    def _count_channels(hierarchy):
        """Sum the channel counts of all CHANNELS lines in a hierarchy text."""
        return sum(
            int(fields[1])
            for fields in (line.split() for line in hierarchy.splitlines())
            if len(fields) > 1 and fields[0] == "CHANNELS"
        )

    def export(self, skeleton_data, output_path):
        """
        Write the skeleton frames to a BVH file.

        Args:
            skeleton_data: list of frame dicts, each with a 'landmarks' dict
            output_path: path of the file to write
        """
        with open(output_path, 'w', encoding='utf-8') as f:
            # The file starts directly with the HIERARCHY keyword; the
            # template's leading blank line is stripped.
            f.write(self.HIERARCHY.strip() + "\n")

            f.write("MOTION\n")
            f.write(f"Frames: {len(skeleton_data)}\n")
            f.write(f"Frame Time: {self.frame_time:.6f}\n")

            # One line of channel values per frame
            for frame_data in skeleton_data:
                motion_values = self.skeleton_to_motion(frame_data)
                f.write(" ".join([f"{v:.4f}" for v in motion_values]) + "\n")

    def skeleton_to_motion(self, frame_data):
        """
        Convert one skeleton frame into a line of BVH channel values.

        Simplified: the root position is the midpoint of the two hips scaled
        by 100, and all rotations are 0. When a hip is missing the root
        position is 0 as well, so every frame has the same number of values.

        Args:
            frame_data: dict with a 'landmarks' dict of named points

        Returns:
            List with one value per channel declared in HIERARCHY.
        """
        landmarks = frame_data.get('landmarks', {})

        values = [0.0] * self.channel_count

        if 'left_hip' in landmarks and 'right_hip' in landmarks:
            left, right = landmarks['left_hip'], landmarks['right_hip']
            values[0] = (left['x'] + right['x']) / 2 * 100
            values[1] = (left['y'] + right['y']) / 2 * 100
            values[2] = (left.get('z', 0) + right.get('z', 0)) / 2 * 100
            # NOTE(review): the input y axis is used as-is for Yposition;
            # if the landmarks are image coordinates (y down) this needs
            # flipping -- confirm against the landmark source.

        return values
 
# Usage example
if __name__ == "__main__":
    # With frames from the skeleton extractor the export would look like:
    # skeleton_data = extractor.extract_from_video('input.mp4')
    # exporter.export(skeleton_data, 'output.bvh')
    exporter = BVHExporter()

    print("BVH导出器就绪")

5.2 Unity中应用动捕数据

// Unity中接收外部骨架数据并驱动角色
using UnityEngine;
using System.Collections.Generic;
 
/// <summary>
/// Drives a character's bones from skeleton data delivered as a JSON string.
/// Elbow, neck and head rotations are smoothed per joint; the hip position
/// is scaled and blended toward the incoming value.
/// </summary>
public class SkeletonDriver : MonoBehaviour
{
    // Bone mapping
    public Transform hip;
    public Transform spine;
    public Transform chest;
    public Transform neck;
    public Transform head;
    public Transform leftShoulder;
    public Transform leftElbow;
    public Transform leftWrist;
    public Transform rightShoulder;
    public Transform rightElbow;
    public Transform rightWrist;

    // Blend weight toward the newest sample on every update
    // (0 = ignore new data, 1 = no smoothing).
    private const float SmoothingFactor = 0.3f;

    // Multiplier applied to incoming hip positions before they are used
    // as a local position.
    private const float PositionScale = 100f;

    // Last smoothed value per joint key
    private Dictionary<string, float> smoothValues = new Dictionary<string, float>();

    void Start()
    {
        // Seed the smoothing table
        smoothValues["hip_x"] = 0;
        smoothValues["hip_y"] = 0;
        smoothValues["spine_rot"] = 0;
        // ...
    }

    /// <summary>
    /// Parses one skeleton sample and applies it to the mapped bones.
    /// Empty or malformed input is ignored so that a single bad message
    /// does not interrupt the stream.
    /// </summary>
    /// <param name="jsonData">JSON text matching <see cref="SkeletonData"/>.</param>
    public void ApplySkeletonData(string jsonData)
    {
        if (string.IsNullOrEmpty(jsonData)) return;

        SkeletonData data;
        try
        {
            data = JsonUtility.FromJson<SkeletonData>(jsonData);
        }
        catch (System.ArgumentException e)
        {
            // JsonUtility reports malformed JSON with ArgumentException.
            Debug.LogWarning("SkeletonDriver: ignoring malformed skeleton JSON: " + e.Message);
            return;
        }

        if (data == null) return;

        // Joint rotations
        ApplyRotation(leftElbow, data.leftElbowAngle, "left_elbow");
        ApplyRotation(rightElbow, data.rightElbowAngle, "right_elbow");

        ApplyRotation(neck, data.neckAngle, "neck");
        ApplyRotation(head, data.headAngle, "head");

        // Root position
        ApplyPosition(hip, data.hipPosition);
    }

    /// <summary>
    /// Smoothly steers the bone's local X rotation toward <c>rotation.x</c>.
    /// Only the X component is driven; the bone's current Y and Z Euler
    /// angles are kept.
    /// </summary>
    void ApplyRotation(Transform bone, Vector3 rotation, string key)
    {
        if (bone == null) return;

        float current;
        if (!smoothValues.TryGetValue(key, out current))
            current = 0;

        // LerpAngle interpolates along the shortest arc, so a step such as
        // 350 -> 10 degrees moves 20 degrees forward instead of sweeping
        // 340 degrees backwards as a plain Lerp would.
        float smoothed = Mathf.LerpAngle(current, rotation.x, SmoothingFactor);
        smoothValues[key] = smoothed;

        Vector3 euler = bone.localEulerAngles;
        bone.localEulerAngles = new Vector3(smoothed, euler.y, euler.z);
    }

    /// <summary>
    /// Blends the bone's local position toward the scaled target position.
    /// </summary>
    void ApplyPosition(Transform bone, Vector3 position)
    {
        if (bone == null) return;

        bone.localPosition = Vector3.Lerp(
            bone.localPosition,
            position * PositionScale,
            SmoothingFactor
        );
    }
}
 
// Payload of one skeleton sample. SkeletonDriver.ApplySkeletonData fills it
// via JsonUtility.FromJson, so the JSON keys must match these field names.
[System.Serializable]
public class SkeletonData
{
    // Hip position; SkeletonDriver multiplies it by 100 before blending it
    // into the hip bone's local position.
    public Vector3 hipPosition;
    // Joint angles. SkeletonDriver.ApplyRotation currently reads only the
    // x component of each vector.
    public Vector3 leftElbowAngle;
    public Vector3 rightElbowAngle;
    public Vector3 neckAngle;
    public Vector3 headAngle;
}

6. 实时动作捕捉系统搭建

6.1 系统架构

┌─────────────┐     ┌─────────────┐     ┌─────────────┐
│   摄像头    │────▶│  MediaPipe  │────▶│   骨架数据   │
│  (RGB/D415) │     │  姿态检测   │     │   (JSON)    │
└─────────────┘     └─────────────┘     └──────┬──────┘
                                                │
                   ┌─────────────┐     ┌────────▼────────┐
                   │  WebSocket  │────▶│   数字人渲染    │
                   │   服务器    │     │  (Unity/UE/Web) │
                   └─────────────┘     └─────────────────┘

6.2 完整代码示例

"""
实时动作捕捉系统
摄像头 → MediaPipe → WebSocket → Unity/WebGL
"""
import cv2
import mediapipe as mp
import asyncio
import websockets
import json
import threading
import numpy as np
 
class RealTimeMoCap:
    """
    Real-time motion capture pipeline.

    Reads webcam frames, runs MediaPipe pose and hand detection, converts the
    pose into a dict of named joints, smooths it, and optionally streams it
    as JSON over a WebSocket.
    """

    def __init__(self, websocket_url=None):
        """
        Set up the detectors and the streaming state.

        Args:
            websocket_url: URL of the WebSocket server that receives the
                skeleton data. When ``None`` nothing is sent.
        """
        self.mp_pose = mp.solutions.pose
        self.pose = self.mp_pose.Pose(
            static_image_mode=False,
            model_complexity=2,
            smooth_landmarks=True,
        )

        self.mp_hands = mp.solutions.hands
        self.hands = self.mp_hands.Hands(
            static_image_mode=False,
            max_num_hands=2,
            model_complexity=1,
        )

        # process_frame draws hand landmarks through this helper, so it has
        # to exist on the instance.
        self.mp_drawing = mp.solutions.drawing_utils

        self.websocket_url = websocket_url
        self.ws = None
        self.running = False
        self.smoothing = 0.7  # weight of the previous frame when smoothing
        self.prev_landmarks = None

        # Joint name -> MediaPipe Pose landmark index.
        # MediaPipe has no hips-centre, spine, chest or neck landmark, so the
        # first four entries are rough stand-ins taken from the nearest hip
        # or shoulder landmark.
        self.skeleton_mapping = {
            'hips': 23,   # stand-in: left hip
            'spine': 24,  # stand-in: right hip
            'chest': 12,  # stand-in: right shoulder
            'neck': 11,   # stand-in: left shoulder
            'head': 0,    # nose
            'left_shoulder': 11,
            'left_elbow': 13,
            'left_wrist': 15,
            'right_shoulder': 12,
            'right_elbow': 14,
            'right_wrist': 16,
            'left_hip': 23,
            'left_knee': 25,
            'left_ankle': 27,
            'right_hip': 24,
            'right_knee': 26,
            'right_ankle': 28,
        }

    async def connect_websocket(self):
        """Open the WebSocket connection when a URL was configured."""
        if self.websocket_url:
            self.ws = await websockets.connect(self.websocket_url)
            print(f"已连接到 {self.websocket_url}")

    async def send_data(self, data):
        """Send ``data`` as JSON; a no-op while no connection is open."""
        if self.ws:
            await self.ws.send(json.dumps(data))

    def process_frame(self, frame):
        """
        Run detection on one BGR frame and draw the results onto it.

        Args:
            frame: BGR image; it is annotated in place.

        Returns:
            ``(frame, skeleton_data)``. ``skeleton_data`` is the smoothed
            joint dict, or ``None`` when no pose was detected. Hand landmarks
            are only drawn; they are not part of ``skeleton_data``.
        """
        # MediaPipe expects RGB input
        image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        pose_results = self.pose.process(image_rgb)
        hand_results = self.hands.process(image_rgb)

        skeleton_data = None

        if pose_results.pose_landmarks:
            skeleton_data = self.extract_skeleton(pose_results.pose_landmarks)
            skeleton_data = self.smooth_skeleton(skeleton_data)
            self.draw_skeleton(frame, pose_results.pose_landmarks)

        if hand_results.multi_hand_landmarks:
            for hand_landmarks in hand_results.multi_hand_landmarks:
                self.mp_drawing.draw_landmarks(
                    frame,
                    hand_landmarks,
                    self.mp_hands.HAND_CONNECTIONS,
                )

        return frame, skeleton_data

    @staticmethod
    def _landmark_points(landmarks):
        """
        Return the indexable sequence of landmark points.

        MediaPipe hands back a landmark-list message whose points live in its
        ``landmark`` field; the message itself cannot be indexed. A plain
        sequence of points is passed through unchanged.
        """
        return getattr(landmarks, 'landmark', landmarks)

    def extract_skeleton(self, landmarks):
        """
        Build the named-joint dict from detected pose landmarks.

        Args:
            landmarks: MediaPipe pose landmark list, or a plain sequence of
                objects with ``x``, ``y``, ``z`` and ``visibility``.

        Returns:
            Dict mapping every name in ``skeleton_mapping`` to a dict with
            the keys ``'x'``, ``'y'``, ``'z'`` and ``'visibility'``.
        """
        points = self._landmark_points(landmarks)

        data = {}
        for name, idx in self.skeleton_mapping.items():
            point = points[idx]
            data[name] = {
                'x': point.x,
                'y': point.y,
                'z': point.z,
                'visibility': point.visibility,
            }

        return data

    def smooth_skeleton(self, current):
        """
        Exponentially smooth joint positions across frames.

        Each coordinate becomes ``prev * smoothing + curr * (1 - smoothing)``.
        Visibility is passed through unsmoothed. The first frame, and any
        joint absent from the previous frame, is returned as-is.

        Args:
            current: Joint dict as produced by ``extract_skeleton``.

        Returns:
            The smoothed joint dict, which is also stored as the reference
            for the next call.
        """
        if self.prev_landmarks is None:
            self.prev_landmarks = current
            return current

        keep = self.smoothing
        smoothed = {}
        for key, curr in current.items():
            prev = self.prev_landmarks.get(key)
            if prev is None:
                smoothed[key] = curr
                continue

            smoothed[key] = {
                'x': prev['x'] * keep + curr['x'] * (1 - keep),
                'y': prev['y'] * keep + curr['y'] * (1 - keep),
                'z': prev['z'] * keep + curr['z'] * (1 - keep),
                'visibility': curr['visibility'],
            }

        self.prev_landmarks = smoothed
        return smoothed

    def draw_skeleton(self, frame, landmarks):
        """
        Draw the pose as green bones and red joints onto ``frame``.

        Args:
            frame: BGR image, modified in place.
            landmarks: MediaPipe pose landmark list, or a plain sequence of
                objects with normalized ``x`` and ``y``.
        """
        points = self._landmark_points(landmarks)
        h, w, _ = frame.shape

        # Normalized coordinates -> pixel coordinates, computed once
        pixels = [(int(p.x * w), int(p.y * h)) for p in points]

        for start_idx, end_idx in mp.solutions.pose.POSE_CONNECTIONS:
            cv2.line(frame, pixels[start_idx], pixels[end_idx], (0, 255, 0), 2)

        for pixel in pixels:
            cv2.circle(frame, pixel, 4, (0, 0, 255), -1)

    async def run(self):
        """
        Capture from the default camera until ESC is pressed, the camera
        stops delivering frames, or ``stop`` is called.

        The camera, the preview window and the WebSocket are released even
        when connecting or sending raises.
        """
        cap = cv2.VideoCapture(0)
        self.running = True

        try:
            if self.websocket_url:
                await self.connect_websocket()

            while self.running and cap.isOpened():
                success, frame = cap.read()
                if not success:
                    break

                # Mirror the image horizontally before detection, so the
                # landmarks are computed on the mirrored frame.
                frame = cv2.flip(frame, 1)

                frame, skeleton_data = self.process_frame(frame)

                cv2.imshow('Real-Time MoCap', frame)

                if skeleton_data:
                    await self.send_data({
                        'type': 'skeleton',
                        'data': skeleton_data,
                    })

                # ESC quits
                if cv2.waitKey(5) & 0xFF == 27:
                    break

                # The OpenCV calls above block; yield once per frame so other
                # tasks on the event loop still get to run when nothing was
                # sent this frame.
                await asyncio.sleep(0)
        finally:
            self.running = False
            cap.release()
            cv2.destroyAllWindows()

            if self.ws:
                await self.ws.close()
                self.ws = None

    def stop(self):
        """Ask the capture loop to end after the current frame."""
        self.running = False
 
async def main():
    """Run the capture system, streaming to a WebSocket server on localhost."""
    # Endpoint of the local receiver (e.g. Unity)
    capture = RealTimeMoCap(websocket_url="ws://localhost:8765")
    await capture.run()


if __name__ == "__main__":
    asyncio.run(main())

7. 常见问题与解决方案

Q1: 摄像头检测不准确怎么办?

  1. 光线问题:确保环境光线均匀,不要背光
  2. 距离问题:保持适当距离(1.5-3米最佳)
  3. 遮挡问题:尽量让全身都在画面中
  4. 参数调整:增大min_detection_confidence

Q2: 动作抖动怎么解决?

  1. 开启平滑:设置smooth_landmarks=True
  2. 增加后处理:使用移动平均或卡尔曼滤波
  3. 降低帧率:从60fps降到30fps

Q3: 延迟太高怎么办?

  1. 降低模型复杂度:用model_complexity=1代替2
  2. 减小输入分辨率:resize到640x480
  3. 使用GPU:如果支持的话

Q4: 怎么让数字人动作更自然?

  1. 动作混合:不要直接替换,渐变过渡
  2. 添加待机动画:静止时也要有呼吸等微动
  3. 随机微动作:偶尔的眨眼、头部微动

相关文档


更新日志

| 日期 | 版本 | 修改内容 |
| --- | --- | --- |
| 2026-04-18 | v1.0 | 初版完成 |
| 2026-04-24 | v1.1 | 深度改写,增加详细实操代码 |